1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library.
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_CXX_COMPLEX_FIX
7 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
8 
9 #include <petscconf.h>
10 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
11 #include <../src/mat/impls/sbaij/seq/sbaij.h>
12 #include <../src/vec/vec/impls/dvecimpl.h>
13 #include <petsc/private/vecimpl.h>
14 #undef VecType
15 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
16 #include <thrust/async/for_each.h>
17 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
18 #include <cooperative_groups.h>
19 #endif
20 const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
21 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
22   /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc., we list them in
23     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command-line options for them.
24 
25   typedef enum {
26       CUSPARSE_MV_ALG_DEFAULT = 0,
27       CUSPARSE_COOMV_ALG      = 1,
28       CUSPARSE_CSRMV_ALG1     = 2,
29       CUSPARSE_CSRMV_ALG2     = 3
30   } cusparseSpMVAlg_t;
31 
32   typedef enum {
33       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
34       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
35       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
36       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
37       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
38       CUSPARSE_SPMM_ALG_DEFAULT = 0,
39       CUSPARSE_SPMM_COO_ALG1    = 1,
40       CUSPARSE_SPMM_COO_ALG2    = 2,
41       CUSPARSE_SPMM_COO_ALG3    = 3,
42       CUSPARSE_SPMM_COO_ALG4    = 5,
43       CUSPARSE_SPMM_CSR_ALG1    = 4,
44       CUSPARSE_SPMM_CSR_ALG2    = 6,
45   } cusparseSpMMAlg_t;
46 
47   typedef enum {
48       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
49       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
50   } cusparseCsr2CscAlg_t;
51   */
52   const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
53   const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
54   const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
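  /* For example (a sketch of the intended correspondence, not compiled code): because these arrays are ordered
     by enum value, the integer produced by PetscOptionsEnum() can be cast directly to the cuSPARSE enum,
     e.g. (cusparseSpMMAlg_t)4 is CUSPARSE_SPMM_CSR_ALG1 and MatCUSPARSESpMMAlgorithms[4] is "CSR_ALG1". */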
55 #endif
56 
57 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
58 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
59 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
60 
61 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSEBAND(Mat,Mat,IS,IS,const MatFactorInfo*);
62 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSEBAND(Mat,Mat,const MatFactorInfo*);
63 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
64 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
65 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
66 
67 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
68 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
69 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
70 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
71 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
72 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
73 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
74 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
75 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
76 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
77 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
78 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
79 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
80 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
81 
82 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
83 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
84 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
85 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
86 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
87 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
88 
89 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
90 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
91 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);
92 
93 PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
94 PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
95 
96 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
97 
98 PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
99 {
100   cusparseStatus_t   stat;
101   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
102 
103   PetscFunctionBegin;
104   if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
105   cusparsestruct->stream = stream;
106   stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
107   PetscFunctionReturn(0);
108 }
109 
110 PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
111 {
112   cusparseStatus_t   stat;
113   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
114 
115   PetscFunctionBegin;
116   if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
117   if (cusparsestruct->handle != handle) {
118     if (cusparsestruct->handle) {
119       stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
120     }
121     cusparsestruct->handle = handle;
122   }
123   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
124   PetscFunctionReturn(0);
125 }
126 
127 PetscErrorCode MatCUSPARSEClearHandle(Mat A)
128 {
129   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
130   PetscBool          flg;
131   PetscErrorCode     ierr;
132 
133   PetscFunctionBegin;
134   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
135   if (!flg || !cusparsestruct) PetscFunctionReturn(0);
136   if (cusparsestruct->handle) cusparsestruct->handle = 0;
137   PetscFunctionReturn(0);
138 }
139 
140 PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
141 {
142   PetscFunctionBegin;
143   *type = MATSOLVERCUSPARSE;
144   PetscFunctionReturn(0);
145 }
146 
147 /*MC
148   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
149   on a single GPU of type seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
150   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
151   performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
152   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
153   algorithms are not recommended. This class does NOT support direct solver operations.
154 
155   Level: beginner
156 
157 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
158 M*/
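/*
   Example usage (a sketch, not part of the manual page above): to use this factorization back end for an
   ILU preconditioner from the command line, one would typically combine a GPU matrix type with the
   cusparse solver type, e.g.

     -mat_type aijcusparse -pc_type ilu -pc_factor_mat_solver_type cusparse

   These option names assume the standard PETSc KSP/PC options database.
*/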
159 
160 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
161 {
162   PetscErrorCode ierr;
163   PetscInt       n = A->rmap->n;
164 
165   PetscFunctionBegin;
166   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
167   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
168   (*B)->factortype = ftype;
169   ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
170 
171   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
172     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
173     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
174     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
175     ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
176     ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]);CHKERRQ(ierr);
177     ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);CHKERRQ(ierr);
178   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
179     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
180     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
181     ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);CHKERRQ(ierr);
182     ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]);CHKERRQ(ierr);
183   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
184 
185   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
186   (*B)->canuseordering = PETSC_TRUE;
187   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
188   PetscFunctionReturn(0);
189 }
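/*
   A minimal programmatic sketch of obtaining a factor through the routine above (assuming A is an assembled
   MATSEQAIJCUSPARSE matrix, isrow/iscol come from MatGetOrdering() and info is a filled MatFactorInfo;
   error checking omitted):

     Mat F;
     MatGetFactor(A,MATSOLVERCUSPARSE,MAT_FACTOR_ILU,&F);
     MatILUFactorSymbolic(F,A,isrow,iscol,&info);
     MatLUFactorNumeric(F,A,&info);
*/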
190 
191 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
192 {
193   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
194 
195   PetscFunctionBegin;
196   switch (op) {
197   case MAT_CUSPARSE_MULT:
198     cusparsestruct->format = format;
199     break;
200   case MAT_CUSPARSE_ALL:
201     cusparsestruct->format = format;
202     break;
203   default:
204     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
205   }
206   PetscFunctionReturn(0);
207 }
208 
209 /*@
210    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
211    operation. Only the MatMult operation can use different GPU storage formats
212    for MPIAIJCUSPARSE matrices.
213    Not Collective
214 
215    Input Parameters:
216 +  A - Matrix of type SEQAIJCUSPARSE
217 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
218 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
219 
222    Level: intermediate
223 
224 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
225 @*/
226 PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
227 {
228   PetscErrorCode ierr;
229 
230   PetscFunctionBegin;
231   PetscValidHeaderSpecific(A, MAT_CLASSID,1);
232   ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
233   PetscFunctionReturn(0);
234 }
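/*
   A short usage sketch (hypothetical matrix A of type MATSEQAIJCUSPARSE; error checking omitted). The first
   call selects ELL storage for MatMult only, the second selects CSR for all operations:

     MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,MAT_CUSPARSE_ELL);
     MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,MAT_CUSPARSE_CSR);

   The same choice can be made at runtime with -mat_cusparse_mult_storage_format and
   -mat_cusparse_storage_format (see MatSetFromOptions_SeqAIJCUSPARSE below).
*/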
235 
236 PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
237 {
238   PetscErrorCode ierr;
239 
240   PetscFunctionBegin;
241   switch (op) {
242     case MAT_FORM_EXPLICIT_TRANSPOSE:
243       /* need to destroy the transpose matrix if present, to prevent logic errors if flg is set to true later */
244       if (A->form_explicit_transpose && !flg) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);}
245       A->form_explicit_transpose = flg;
246       break;
247     default:
248       ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
249       break;
250   }
251   PetscFunctionReturn(0);
252 }
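/*
   For example (a sketch): MatSetOption(A,MAT_FORM_EXPLICIT_TRANSPOSE,PETSC_TRUE) requests that an explicit
   transpose be formed and cached on the GPU for repeated transpose operations; switching the option back to
   PETSC_FALSE invalidates any cached transpose, as handled above.
*/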
253 
254 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
255 
256 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
257 {
258   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
259   IS             isrow = b->row,iscol = b->col;
260   PetscBool      row_identity,col_identity;
261   PetscErrorCode ierr;
262 
263   PetscFunctionBegin;
264   ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
265   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
266   B->offloadmask = PETSC_OFFLOAD_CPU;
267   /* determine which version of MatSolve needs to be used. */
268   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
269   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
270   if (row_identity && col_identity) {
271     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
272     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
273     B->ops->matsolve = NULL;
274     B->ops->matsolvetranspose = NULL;
275   } else {
276     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
277     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
278     B->ops->matsolve = NULL;
279     B->ops->matsolvetranspose = NULL;
280   }
281 
282   /* get the triangular factors */
283   ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
284   PetscFunctionReturn(0);
285 }
286 
287 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
288 {
289   PetscErrorCode           ierr;
290   MatCUSPARSEStorageFormat format;
291   PetscBool                flg;
292   Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
293 
294   PetscFunctionBegin;
295   ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
296   if (A->factortype == MAT_FACTOR_NONE) {
297     ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
298                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
299     if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}
300 
301     ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
302                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
303     if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
304    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
305     ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
306                             "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
307     /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
308     if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
309 
310     ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
311                             "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
312     if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
313 
314     ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
315                             "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
316     if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
317    #endif
318   }
319   ierr = PetscOptionsTail();CHKERRQ(ierr);
320   PetscFunctionReturn(0);
321 }
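/*
   Runtime options sketch (option names from the PetscOptionsEnum() calls above; values are matched
   case-insensitively against the enum string lists at the top of this file):

     -mat_cusparse_storage_format csr
     -mat_cusparse_spmv_alg csrmv_alg1       (CUDA >= 11 only)
     -mat_cusparse_csr2csc_alg alg1          (CUDA >= 11 only)
*/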
322 
323 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
324 {
325   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
326   PetscErrorCode               ierr;
327 
328   PetscFunctionBegin;
329   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
330   ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
331   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
332   PetscFunctionReturn(0);
333 }
334 
335 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
336 {
337   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
338   PetscErrorCode               ierr;
339 
340   PetscFunctionBegin;
341   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
342   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
343   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
344   PetscFunctionReturn(0);
345 }
346 
347 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
348 {
349   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
350   PetscErrorCode               ierr;
351 
352   PetscFunctionBegin;
353   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
354   ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
355   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
356   PetscFunctionReturn(0);
357 }
358 
359 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
360 {
361   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
362   PetscErrorCode               ierr;
363 
364   PetscFunctionBegin;
365   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
366   ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
367   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
368   PetscFunctionReturn(0);
369 }
370 
371 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
372 {
373   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
374   PetscInt                          n = A->rmap->n;
375   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
376   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
377   cusparseStatus_t                  stat;
378   const PetscInt                    *ai = a->i,*aj = a->j,*vi;
379   const MatScalar                   *aa = a->a,*v;
380   PetscInt                          *AiLo, *AjLo;
381   PetscInt                          i,nz, nzLower, offset, rowOffset;
382   PetscErrorCode                    ierr;
383   cudaError_t                       cerr;
384 
385   PetscFunctionBegin;
386   if (!n) PetscFunctionReturn(0);
387   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
388     try {
389       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
390       nzLower=n+ai[n]-ai[1];
391       if (!loTriFactor) {
392         PetscScalar                       *AALo;
393 
394         cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
395 
396         /* Allocate Space for the lower triangular matrix */
397         cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
398         cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
399 
400         /* Fill the lower triangular matrix */
401         AiLo[0]  = (PetscInt) 0;
402         AiLo[n]  = nzLower;
403         AjLo[0]  = (PetscInt) 0;
404         AALo[0]  = (MatScalar) 1.0;
405         v        = aa;
406         vi       = aj;
407         offset   = 1;
408         rowOffset= 1;
409         for (i=1; i<n; i++) {
410           nz = ai[i+1] - ai[i];
411           /* additional 1 for the term on the diagonal */
412           AiLo[i]    = rowOffset;
413           rowOffset += nz+1;
414 
415           ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
416           ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);
417 
418           offset      += nz;
419           AjLo[offset] = (PetscInt) i;
420           AALo[offset] = (MatScalar) 1.0;
421           offset      += 1;
422 
423           v  += nz;
424           vi += nz;
425         }
426 
427         /* allocate space for the triangular factor information */
428         ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
429         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
430         /* Create the matrix description */
431         stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
432         stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
433        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
434         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
435        #else
436         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
437        #endif
438         stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
439         stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
440 
441         /* set the operation */
442         loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
443 
444         /* set the matrix */
445         loTriFactor->csrMat = new CsrMatrix;
446         loTriFactor->csrMat->num_rows = n;
447         loTriFactor->csrMat->num_cols = n;
448         loTriFactor->csrMat->num_entries = nzLower;
449 
450         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
451         loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);
452 
453         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
454         loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);
455 
456         loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
457         loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
458 
459         /* Create the solve analysis information */
460         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
461         stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
462       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
463         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
464                                        loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
465                                        loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
466                                        loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
467                                        &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
468         cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
469       #endif
470 
471         /* perform the solve analysis */
472         stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
473                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
474                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
475                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
476                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
477                                  ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
478                                #endif
479 );CHKERRCUSPARSE(stat);
480         cerr = WaitForCUDA();CHKERRCUDA(cerr);
481         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
482 
483         /* assign the pointer */
484         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
485         loTriFactor->AA_h = AALo;
486         cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
487         cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
488         ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
489       } else { /* update values only */
490         if (!loTriFactor->AA_h) {
491           cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
492         }
493         /* Fill the lower triangular matrix */
494         loTriFactor->AA_h[0]  = 1.0;
495         v        = aa;
496         vi       = aj;
497         offset   = 1;
498         for (i=1; i<n; i++) {
499           nz = ai[i+1] - ai[i];
500           ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
501           offset      += nz;
502           loTriFactor->AA_h[offset] = 1.0;
503           offset      += 1;
504           v  += nz;
505         }
506         loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
507         ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
508       }
509     } catch(char *ex) {
510       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
511     }
512   }
513   PetscFunctionReturn(0);
514 }
515 
516 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
517 {
518   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
519   PetscInt                          n = A->rmap->n;
520   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
521   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
522   cusparseStatus_t                  stat;
523   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
524   const MatScalar                   *aa = a->a,*v;
525   PetscInt                          *AiUp, *AjUp;
526   PetscInt                          i,nz, nzUpper, offset;
527   PetscErrorCode                    ierr;
528   cudaError_t                       cerr;
529 
530   PetscFunctionBegin;
531   if (!n) PetscFunctionReturn(0);
532   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
533     try {
534       /* next, figure out the number of nonzeros in the upper triangular matrix. */
535       nzUpper = adiag[0]-adiag[n];
536       if (!upTriFactor) {
537         PetscScalar *AAUp;
538 
539         cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
540 
541         /* Allocate Space for the upper triangular matrix */
542         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
543         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
544 
545         /* Fill the upper triangular matrix */
546         AiUp[0]=(PetscInt) 0;
547         AiUp[n]=nzUpper;
548         offset = nzUpper;
549         for (i=n-1; i>=0; i--) {
550           v  = aa + adiag[i+1] + 1;
551           vi = aj + adiag[i+1] + 1;
552 
553           /* number of elements NOT on the diagonal */
554           nz = adiag[i] - adiag[i+1]-1;
555 
556           /* decrement the offset */
557           offset -= (nz+1);
558 
559           /* first, set the diagonal elements */
560           AjUp[offset] = (PetscInt) i;
561           AAUp[offset] = (MatScalar)1./v[nz];
562           AiUp[i]      = AiUp[i+1] - (nz+1);
563 
564           ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
565           ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
566         }
567 
568         /* allocate space for the triangular factor information */
569         ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
570         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
571 
572         /* Create the matrix description */
573         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
574         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
575        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
576         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
577        #else
578         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
579        #endif
580         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
581         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
582 
583         /* set the operation */
584         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
585 
586         /* set the matrix */
587         upTriFactor->csrMat = new CsrMatrix;
588         upTriFactor->csrMat->num_rows = n;
589         upTriFactor->csrMat->num_cols = n;
590         upTriFactor->csrMat->num_entries = nzUpper;
591 
592         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
593         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
594 
595         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
596         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
597 
598         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
599         upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
600 
601         /* Create the solve analysis information */
602         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
603         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
604       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
605         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
606                                      upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
607                                      upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
608                                      upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
609                                      &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
610         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
611       #endif
612 
613         /* perform the solve analysis */
614         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
615                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
616                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
617                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
618                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
619                                  ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
620                                #endif
621 );CHKERRCUSPARSE(stat);
622         cerr = WaitForCUDA();CHKERRCUDA(cerr);
623         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
624 
625         /* assign the pointer */
626         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
627         upTriFactor->AA_h = AAUp;
628         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
629         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
630         ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
631       } else {
632         if (!upTriFactor->AA_h) {
633           cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
634         }
635         /* Fill the upper triangular matrix */
636         offset = nzUpper;
637         for (i=n-1; i>=0; i--) {
638           v  = aa + adiag[i+1] + 1;
639 
640           /* number of elements NOT on the diagonal */
641           nz = adiag[i] - adiag[i+1]-1;
642 
643           /* decrement the offset */
644           offset -= (nz+1);
645 
646           /* first, set the diagonal elements */
647           upTriFactor->AA_h[offset] = 1./v[nz];
648           ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
649         }
650         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
651         ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
652       }
653     } catch(char *ex) {
654       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
655     }
656   }
657   PetscFunctionReturn(0);
658 }
659 
660 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
661 {
662   PetscErrorCode               ierr;
663   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
664   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
665   IS                           isrow = a->row,iscol = a->icol;
666   PetscBool                    row_identity,col_identity;
667   PetscInt                     n = A->rmap->n;
668 
669   PetscFunctionBegin;
670   if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
671   ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
672   ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);
673 
674   if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
675   cusparseTriFactors->nnz=a->nz;
676 
677   A->offloadmask = PETSC_OFFLOAD_BOTH;
678   /* lower triangular indices */
679   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
680   if (!row_identity && !cusparseTriFactors->rpermIndices) {
681     const PetscInt *r;
682 
683     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
684     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
685     cusparseTriFactors->rpermIndices->assign(r, r+n);
686     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
687     ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
688   }
689 
690   /* upper triangular indices */
691   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
692   if (!col_identity && !cusparseTriFactors->cpermIndices) {
693     const PetscInt *c;
694 
695     ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
696     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
697     cusparseTriFactors->cpermIndices->assign(c, c+n);
698     ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
699     ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
700   }
701   PetscFunctionReturn(0);
702 }
703 
704 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
705 {
706   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
707   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
708   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
709   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
710   cusparseStatus_t                  stat;
711   PetscErrorCode                    ierr;
712   cudaError_t                       cerr;
713   PetscInt                          *AiUp, *AjUp;
714   PetscScalar                       *AAUp;
715   PetscScalar                       *AALo;
716   PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
717   Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
718   const PetscInt                    *ai = b->i,*aj = b->j,*vj;
719   const MatScalar                   *aa = b->a,*v;
720 
721   PetscFunctionBegin;
722   if (!n) PetscFunctionReturn(0);
723   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
724     try {
725       cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
726       cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
727       if (!upTriFactor && !loTriFactor) {
728         /* Allocate Space for the upper triangular matrix */
729         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
730         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
731 
732         /* Fill the upper triangular matrix */
733         AiUp[0]=(PetscInt) 0;
734         AiUp[n]=nzUpper;
735         offset = 0;
736         for (i=0; i<n; i++) {
737           /* set the pointers */
738           v  = aa + ai[i];
739           vj = aj + ai[i];
740           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
741 
742           /* first, set the diagonal elements */
743           AjUp[offset] = (PetscInt) i;
744           AAUp[offset] = (MatScalar)1.0/v[nz];
745           AiUp[i]      = offset;
746           AALo[offset] = (MatScalar)1.0/v[nz];
747 
748           offset+=1;
749           if (nz>0) {
750             ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
751             ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
752             for (j=offset; j<offset+nz; j++) {
753               AAUp[j] = -AAUp[j];
754               AALo[j] = AAUp[j]/v[nz];
755             }
756             offset+=nz;
757           }
758         }
759 
760         /* allocate space for the triangular factor information */
761         ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
762         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
763 
764         /* Create the matrix description */
765         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
766         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
767        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
768         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
769        #else
770         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
771        #endif
772         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
773         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
774 
775         /* set the matrix */
776         upTriFactor->csrMat = new CsrMatrix;
777         upTriFactor->csrMat->num_rows = A->rmap->n;
778         upTriFactor->csrMat->num_cols = A->cmap->n;
779         upTriFactor->csrMat->num_entries = a->nz;
780 
781         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
782         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
783 
784         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
785         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
786 
787         upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
788         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
789 
790         /* set the operation */
791         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
792 
793         /* Create the solve analysis information */
794         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
795         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
796       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
797         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
798                                        upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
799                                        upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
800                                        upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
801                                        &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
802         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
803       #endif
804 
805         /* perform the solve analysis */
806         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
807                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
808                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
809                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
810                                 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
811                                  ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
812                                 #endif
813 );CHKERRCUSPARSE(stat);
814         cerr = WaitForCUDA();CHKERRCUDA(cerr);
815         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
816 
817         /* assign the pointer */
818         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
819 
820         /* allocate space for the triangular factor information */
821         ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
822         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
823 
824         /* Create the matrix description */
825         stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
826         stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
827        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
828         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
829        #else
830         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
831        #endif
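        /* note: this factor reuses the upper-triangular CSR data (AiUp/AjUp/AALo) and the analysis/solve below
           use CUSPARSE_OPERATION_TRANSPOSE, so the fill mode is deliberately UPPER here */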
832         stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
833         stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
834 
835         /* set the operation */
836         loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
837 
838         /* set the matrix */
839         loTriFactor->csrMat = new CsrMatrix;
840         loTriFactor->csrMat->num_rows = A->rmap->n;
841         loTriFactor->csrMat->num_cols = A->cmap->n;
842         loTriFactor->csrMat->num_entries = a->nz;
843 
844         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
845         loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
846 
847         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
848         loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
849 
850         loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
851         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
852 
853         /* Create the solve analysis information */
854         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
855         stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
856       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
857         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
858                                        loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
859                                        loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
860                                        loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
861                                        &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
862         cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
863       #endif
864 
865         /* perform the solve analysis */
866         stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
867                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
868                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
869                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
870                                 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
871                                  ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
872                                 #endif
873 );CHKERRCUSPARSE(stat);
874         cerr = WaitForCUDA();CHKERRCUDA(cerr);
875         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
876 
877         /* assign the pointer */
878         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
879 
880         ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
881         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
882         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
883       } else {
884         /* Fill the upper triangular matrix */
885         offset = 0;
886         for (i=0; i<n; i++) {
887           /* set the pointers */
888           v  = aa + ai[i];
889           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
890 
891           /* first, set the diagonal elements */
892           AAUp[offset] = 1.0/v[nz];
893           AALo[offset] = 1.0/v[nz];
894 
895           offset+=1;
896           if (nz>0) {
897             ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
898             for (j=offset; j<offset+nz; j++) {
899               AAUp[j] = -AAUp[j];
900               AALo[j] = AAUp[j]/v[nz];
901             }
902             offset+=nz;
903           }
904         }
905         if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
906         if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
907         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
908         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
909         ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
910       }
911       cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
912       cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
913     } catch(char *ex) {
914       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
915     }
916   }
917   PetscFunctionReturn(0);
918 }
919 
920 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
921 {
922   PetscErrorCode               ierr;
923   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
924   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
925   IS                           ip = a->row;
926   PetscBool                    perm_identity;
927   PetscInt                     n = A->rmap->n;
928 
929   PetscFunctionBegin;
930   if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
931   ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
932   if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
933   cusparseTriFactors->nnz=(a->nz-n)*2 + n;
934 
935   A->offloadmask = PETSC_OFFLOAD_BOTH;
936 
937   /* lower triangular indices */
938   ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
939   if (!perm_identity) {
940     IS             iip;
941     const PetscInt *irip,*rip;
942 
943     ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
944     ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
945     ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
946     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
947     cusparseTriFactors->rpermIndices->assign(rip, rip+n);
948     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
949     cusparseTriFactors->cpermIndices->assign(irip, irip+n);
950     ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
951     ierr = ISDestroy(&iip);CHKERRQ(ierr);
952     ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
953     ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
954   }
955   PetscFunctionReturn(0);
956 }
957 
958 #define CHECK_LAUNCH_ERROR()                                                             \
959 do {                                                                                     \
960   /* Check synchronous errors, i.e. pre-launch */                                        \
961   cudaError_t err = cudaGetLastError();                                                  \
962   if (cudaSuccess != err) {                                                              \
963     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cuda error: %s",cudaGetErrorString(err)); \
964   }                                                                                      \
965   /* Check asynchronous errors, i.e. kernel failed (ULF) */                              \
966   err = cudaDeviceSynchronize();                                                         \
967   if (cudaSuccess != err) {                                                              \
968     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cuda error: %s",cudaGetErrorString(err)); \
969   }                                                                                      \
970  } while (0)
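/*
   Typical use of the macro above right after a kernel launch (a sketch; MyKernel is a placeholder name):

     MyKernel<<<nblocks,nthreads>>>(args);
     CHECK_LAUNCH_ERROR();

   The cudaGetLastError() call catches synchronous launch errors, while the cudaDeviceSynchronize()
   catches asynchronous failures that occur while the kernel runs.
*/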
971 
972 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
973 {
974   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
975   IS             ip = b->row;
976   PetscBool      perm_identity;
977   PetscErrorCode ierr;
978 
979   PetscFunctionBegin;
980   ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
981   ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
982   B->offloadmask = PETSC_OFFLOAD_CPU;
983   /* determine which version of MatSolve needs to be used. */
984   ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
985   if (perm_identity) {
986     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
987     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
988     B->ops->matsolve = NULL;
989     B->ops->matsolvetranspose = NULL;
990   } else {
991     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
992     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
993     B->ops->matsolve = NULL;
994     B->ops->matsolvetranspose = NULL;
995   }
996 
997   /* get the triangular factors */
998   ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
999   PetscFunctionReturn(0);
1000 }
1001 
1002 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
1003 {
1004   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1005   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1006   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1007   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
1008   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1009   cusparseStatus_t                  stat;
1010   cusparseIndexBase_t               indexBase;
1011   cusparseMatrixType_t              matrixType;
1012   cusparseFillMode_t                fillMode;
1013   cusparseDiagType_t                diagType;
1014   cudaError_t                       cerr;
1015   PetscErrorCode                    ierr;
1016 
1017   PetscFunctionBegin;
1018   /* allocate space for the transpose of the lower triangular factor */
1019   ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
1020   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1021 
1022   /* set the matrix descriptors of the lower triangular factor */
1023   matrixType = cusparseGetMatType(loTriFactor->descr);
1024   indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
1025   fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1026     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1027   diagType = cusparseGetMatDiagType(loTriFactor->descr);
1028 
1029   /* Create the matrix description */
1030   stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
1031   stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
1032   stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
1033   stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
1034   stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
1035 
1036   /* set the operation */
1037   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1038 
1039   /* allocate GPU space for the CSC of the lower triangular factor*/
1040   loTriFactorT->csrMat = new CsrMatrix;
1041   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
1042   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
1043   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
1044   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
1045   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1046   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
1047 
1048   /* compute the transpose of the lower triangular factor, i.e. the CSC */
1049 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1050   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1051                                        loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1052                                        loTriFactor->csrMat->values->data().get(),
1053                                        loTriFactor->csrMat->row_offsets->data().get(),
1054                                        loTriFactor->csrMat->column_indices->data().get(),
1055                                        loTriFactorT->csrMat->values->data().get(),
1056                                        loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1057                                        CUSPARSE_ACTION_NUMERIC,indexBase,
1058                                        CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1059   cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1060 #endif
1061 
1062   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1063   stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1064                           loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1065                           loTriFactor->csrMat->values->data().get(),
1066                           loTriFactor->csrMat->row_offsets->data().get(),
1067                           loTriFactor->csrMat->column_indices->data().get(),
1068                           loTriFactorT->csrMat->values->data().get(),
1069                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1070                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1071                           CUSPARSE_ACTION_NUMERIC, indexBase,
1072                           CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer
1073                         #else
1074                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1075                           CUSPARSE_ACTION_NUMERIC, indexBase
1076                         #endif
1077 );CHKERRCUSPARSE(stat);
1078   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1079   ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1080 
1081   /* Create the solve analysis information */
1082   ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1083   stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1084 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1085   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
1086                                 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1087                                 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1088                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
1089                                 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1090   cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1091 #endif
1092 
1093   /* perform the solve analysis */
1094   stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
1095                            loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1096                            loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1097                            loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo
1098                           #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1099                            ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
1100                           #endif
1101 );CHKERRCUSPARSE(stat);
1102   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1103   ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1104 
1105   /* assign the pointer */
1106   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1107 
1108   /*********************************************/
1109   /* Now the Transpose of the Upper Tri Factor */
1110   /*********************************************/
1111 
1112   /* allocate space for the transpose of the upper triangular factor */
1113   ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
1114   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1115 
1116   /* query the matrix descriptor of the upper triangular factor; its transpose needs the opposite fill mode */
1117   matrixType = cusparseGetMatType(upTriFactor->descr);
1118   indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
1119   fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1120     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1121   diagType = cusparseGetMatDiagType(upTriFactor->descr);
1122 
1123   /* Create the matrix descriptor */
1124   stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
1125   stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
1126   stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
1127   stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
1128   stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
1129 
1130   /* set the operation */
1131   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1132 
1133   /* allocate GPU space for the CSC of the upper triangular factor */
1134   upTriFactorT->csrMat = new CsrMatrix;
1135   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1136   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1137   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1138   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
1139   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1140   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1141 
1142   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1143 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1144   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
1145                                 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1146                                 upTriFactor->csrMat->values->data().get(),
1147                                 upTriFactor->csrMat->row_offsets->data().get(),
1148                                 upTriFactor->csrMat->column_indices->data().get(),
1149                                 upTriFactorT->csrMat->values->data().get(),
1150                                 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1151                                 CUSPARSE_ACTION_NUMERIC,indexBase,
1152                                 CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1153   cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1154 #endif
1155 
1156   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1157   stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
1158                           upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1159                           upTriFactor->csrMat->values->data().get(),
1160                           upTriFactor->csrMat->row_offsets->data().get(),
1161                           upTriFactor->csrMat->column_indices->data().get(),
1162                           upTriFactorT->csrMat->values->data().get(),
1163                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1164                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1165                           CUSPARSE_ACTION_NUMERIC, indexBase,
1166                           CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer
1167                         #else
1168                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1169                           CUSPARSE_ACTION_NUMERIC, indexBase
1170                         #endif
1171 );CHKERRCUSPARSE(stat);
1172   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1173   ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1174 
1175   /* Create the solve analysis information */
1176   ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1177   stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1178   #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1179   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
1180                                  upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1181                                  upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1182                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
1183                                  &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1184   cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1185   #endif
1186 
1187   /* perform the solve analysis */
1188   stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
1189                            upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1190                            upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1191                            upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo
1192                           #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1193                            ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
1194                           #endif
1195 );CHKERRCUSPARSE(stat);
1196   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1197   ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1198 
1199   /* assign the pointer */
1200   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1201   PetscFunctionReturn(0);
1202 }
1203 
1204 struct PetscScalarToPetscInt
1205 {
1206   __host__ __device__
1207   PetscInt operator()(PetscScalar s)
1208   {
1209     return (PetscInt)PetscRealPart(s);
1210   }
1211 };
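     /* Converts the scalar-encoded entry indices produced by the csr2csc permutation trick in
        MatSeqAIJCUSPARSEFormExplicitTransposeForMult() below back to integer indices */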
1212 
1213 static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTransposeForMult(Mat A)
1214 {
1215   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1216   Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
1217   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1218   cusparseStatus_t             stat;
1219   cusparseIndexBase_t          indexBase;
1220   cudaError_t                  err;
1221   PetscErrorCode               ierr;
1222 
1223   PetscFunctionBegin;
1224   if (!A->form_explicit_transpose || !A->rmap->n || !A->cmap->n) PetscFunctionReturn(0);
1225   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
1226   matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1227   if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing mat struct");
1228   matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1229   if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing matTranspose struct");
1230   if (A->transupdated) PetscFunctionReturn(0);
1231   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1232   if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
1233     ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
1234   }
1235   if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1236     matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
1237     stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
1238     indexBase = cusparseGetMatIndexBase(matstruct->descr);
1239     stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
1240     stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
1241 
1242     /* set alpha and beta */
1243     err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
1244     err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1245     err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1246     err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1247     err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1248     err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1249 
1250     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1251       CsrMatrix *matrixT = new CsrMatrix;
1252       matstructT->mat = matrixT;
1253       matrixT->num_rows = A->cmap->n;
1254       matrixT->num_cols = A->rmap->n;
1255       matrixT->num_entries = a->nz;
1256       matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
1257       matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1258       matrixT->values = new THRUSTARRAY(a->nz);
1259 
1260       if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
1261       cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
1262 
1263      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1264       stat = cusparseCreateCsr(&matstructT->matDescr,
1265                                matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
1266                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
1267                                matrixT->values->data().get(),
1268                                CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1269                                indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
1270      #endif
1271     } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1272    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1273       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1274    #else
1275       CsrMatrix *temp  = new CsrMatrix;
1276       CsrMatrix *tempT = new CsrMatrix;
1277       /* First convert HYB to CSR */
1278       temp->num_rows = A->rmap->n;
1279       temp->num_cols = A->cmap->n;
1280       temp->num_entries = a->nz;
1281       temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1282       temp->column_indices = new THRUSTINTARRAY32(a->nz);
1283       temp->values = new THRUSTARRAY(a->nz);
1284 
1285       stat = cusparse_hyb2csr(cusparsestruct->handle,
1286                               matstruct->descr, (cusparseHybMat_t)matstruct->mat,
1287                               temp->values->data().get(),
1288                               temp->row_offsets->data().get(),
1289                               temp->column_indices->data().get());CHKERRCUSPARSE(stat);
1290 
1291       /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1292       tempT->num_rows = A->rmap->n;
1293       tempT->num_cols = A->cmap->n;
1294       tempT->num_entries = a->nz;
1295       tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1296       tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1297       tempT->values = new THRUSTARRAY(a->nz);
1298 
1299       stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
1300                               temp->num_cols, temp->num_entries,
1301                               temp->values->data().get(),
1302                               temp->row_offsets->data().get(),
1303                               temp->column_indices->data().get(),
1304                               tempT->values->data().get(),
1305                               tempT->column_indices->data().get(),
1306                               tempT->row_offsets->data().get(),
1307                               CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1308 
1309       /* Last, convert CSC to HYB */
1310       cusparseHybMat_t hybMat;
1311       stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1312       cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1313         CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1314       stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
1315                               matstructT->descr, tempT->values->data().get(),
1316                               tempT->row_offsets->data().get(),
1317                               tempT->column_indices->data().get(),
1318                               hybMat, 0, partition);CHKERRCUSPARSE(stat);
1319 
1320       /* assign the pointer */
1321       matstructT->mat = hybMat;
1322       A->transupdated = PETSC_TRUE;
1323       /* delete temporaries */
1324       if (tempT) {
1325         if (tempT->values) delete (THRUSTARRAY*) tempT->values;
1326         if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
1327         if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
1328         delete (CsrMatrix*) tempT;
1329       }
1330       if (temp) {
1331         if (temp->values) delete (THRUSTARRAY*) temp->values;
1332         if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
1333         if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
1334         delete (CsrMatrix*) temp;
1335       }
1336      #endif
1337     }
1338   }
1339   if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1340     CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
1341     CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
1342     if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix");
1343     if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix rows");
1344     if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix cols");
1345     if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix values");
1346     if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT");
1347     if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT rows");
1348     if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT cols");
1349     if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT values");
1350     if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1351       cusparsestruct->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
1352       cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
1353       ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
1354     }
1355     if (!cusparsestruct->csr2csc_i) {
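           /* Build the csr2csc_i permutation once: running the numeric csr2csc below on the sequence
              0,1,...,nnz-1 (stored as PetscScalars in csr2csc_a) leaves, in matrixT->values, the CSR
              position of each entry in CSC order; PetscScalarToPetscInt() then converts those values
              to integer indices.  Later transpose updates only gather the CSR values through this
              permutation (see the thrust::copy after this block) instead of calling csr2csc again.
              For example, if the CSC ordering of the CSR entries is (2,0,1), csr2csc maps [0 1 2] to
              [2 0 1], so csr2csc_i = [2 0 1] and matrixT->values[k] = matrix->values[csr2csc_i[k]]. */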
1356       THRUSTARRAY csr2csc_a(matrix->num_entries);
1357       PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));
1358 
1359       indexBase = cusparseGetMatIndexBase(matstruct->descr);
1360      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1361       void   *csr2cscBuffer;
1362       size_t csr2cscBufferSize;
1363       stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
1364                                            A->cmap->n, matrix->num_entries,
1365                                            matrix->values->data().get(),
1366                                            cusparsestruct->rowoffsets_gpu->data().get(),
1367                                            matrix->column_indices->data().get(),
1368                                            matrixT->values->data().get(),
1369                                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1370                                            CUSPARSE_ACTION_NUMERIC,indexBase,
1371                                            cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat);
1372       err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err);
1373      #endif
1374 
1375       if (matrix->num_entries) {
1376         /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
1377            mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
1378            I checked every parameter and they were all fine. I have no clue why cusparse complains.
1379 
1380            Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
1381            should be filled with indexBase. So I just take a shortcut here.
1382         */
1383         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1384                               A->cmap->n,matrix->num_entries,
1385                               csr2csc_a.data().get(),
1386                               cusparsestruct->rowoffsets_gpu->data().get(),
1387                               matrix->column_indices->data().get(),
1388                               matrixT->values->data().get(),
1389                              #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1390                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1391                               CUSPARSE_ACTION_NUMERIC,indexBase,
1392                               cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat);
1393                              #else
1394                               matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1395                               CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1396                              #endif
1397       } else {
1398         matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
1399       }
1400 
1401       cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1402       PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
1403      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1404       err = cudaFree(csr2cscBuffer);CHKERRCUDA(err);
1405      #endif
1406     }
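         /* update the transpose values with a gather of the CSR values through the cached csr2csc_i permutation */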
1407     PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
1408                                                      thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
1409                                                      matrixT->values->begin()));
1410   }
1411   ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1412   /* the compressed row indices are not used for matTranspose */
1413   matstructT->cprowIndices = NULL;
1414   /* assign the pointer */
1415   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
1416   A->transupdated = PETSC_TRUE;
1417   PetscFunctionReturn(0);
1418 }
1419 
1420 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
1421 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1422 {
1423   PetscInt                              n = xx->map->n;
1424   const PetscScalar                     *barray;
1425   PetscScalar                           *xarray;
1426   thrust::device_ptr<const PetscScalar> bGPU;
1427   thrust::device_ptr<PetscScalar>       xGPU;
1428   cusparseStatus_t                      stat;
1429   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1430   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1431   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1432   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1433   PetscErrorCode                        ierr;
1434   cudaError_t                           cerr;
1435 
1436   PetscFunctionBegin;
1437   /* Analyze the matrix and create the transpose ... on the fly */
1438   if (!loTriFactorT && !upTriFactorT) {
1439     ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
1440     loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1441     upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1442   }
1443 
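       /* note the factor order: with A = L*U (up to the row/column permutations), A^T = U^T L^T, so the
          transpose solve applies the explicitly stored U^T first and then L^T */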
1444   /* Get the GPU pointers */
1445   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1446   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1447   xGPU = thrust::device_pointer_cast(xarray);
1448   bGPU = thrust::device_pointer_cast(barray);
1449 
1450   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1451   /* First, reorder with the row permutation */
1452   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1453                thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
1454                xGPU);
1455 
1456   /* Next, solve U */
1457   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1458                         upTriFactorT->csrMat->num_rows,
1459                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1460                         upTriFactorT->csrMat->num_entries,
1461                       #endif
1462                         &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1463                         upTriFactorT->csrMat->values->data().get(),
1464                         upTriFactorT->csrMat->row_offsets->data().get(),
1465                         upTriFactorT->csrMat->column_indices->data().get(),
1466                         upTriFactorT->solveInfo,
1467                         xarray, tempGPU->data().get()
1468                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1469                         ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
1470                       #endif
1471 );CHKERRCUSPARSE(stat);
1472 
1473   /* Then, solve L */
1474   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1475                         loTriFactorT->csrMat->num_rows,
1476                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1477                         loTriFactorT->csrMat->num_entries,
1478                       #endif
1479                         &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1480                         loTriFactorT->csrMat->values->data().get(),
1481                         loTriFactorT->csrMat->row_offsets->data().get(),
1482                         loTriFactorT->csrMat->column_indices->data().get(),
1483                         loTriFactorT->solveInfo,
1484                         tempGPU->data().get(), xarray
1485                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1486                         ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
1487                       #endif
1488 );CHKERRCUSPARSE(stat);
1489 
1490   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1491   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1492                thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
1493                tempGPU->begin());
1494 
1495   /* Copy the temporary to the full solution. */
1496   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);
1497 
1498   /* restore */
1499   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1500   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
1501   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1502   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1503   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1504   PetscFunctionReturn(0);
1505 }
1506 
1507 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1508 {
1509   const PetscScalar                 *barray;
1510   PetscScalar                       *xarray;
1511   cusparseStatus_t                  stat;
1512   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1513   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1514   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1515   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1516   PetscErrorCode                    ierr;
1517   cudaError_t                       cerr;
1518 
1519   PetscFunctionBegin;
1520   /* Analyze the matrix and create the transpose ... on the fly */
1521   if (!loTriFactorT && !upTriFactorT) {
1522     ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
1523     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1524     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1525   }
1526 
1527   /* Get the GPU pointers */
1528   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1529   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1530 
1531   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1532   /* First, solve U */
1533   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1534                         upTriFactorT->csrMat->num_rows,
1535                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1536                         upTriFactorT->csrMat->num_entries,
1537                       #endif
1538                         &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1539                         upTriFactorT->csrMat->values->data().get(),
1540                         upTriFactorT->csrMat->row_offsets->data().get(),
1541                         upTriFactorT->csrMat->column_indices->data().get(),
1542                         upTriFactorT->solveInfo,
1543                         barray, tempGPU->data().get()
1544                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1545                         ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
1546                       #endif
1547 );CHKERRCUSPARSE(stat);
1548 
1549   /* Then, solve L */
1550   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1551                         loTriFactorT->csrMat->num_rows,
1552                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1553                         loTriFactorT->csrMat->num_entries,
1554                       #endif
1555                         &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1556                         loTriFactorT->csrMat->values->data().get(),
1557                         loTriFactorT->csrMat->row_offsets->data().get(),
1558                         loTriFactorT->csrMat->column_indices->data().get(),
1559                         loTriFactorT->solveInfo,
1560                         tempGPU->data().get(), xarray
1561                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1562                         ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
1563                       #endif
1564 );CHKERRCUSPARSE(stat);
1565 
1566   /* restore */
1567   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1568   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
1569   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1570   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1571   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1572   PetscFunctionReturn(0);
1573 }
1574 
1575 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1576 {
1577   const PetscScalar                     *barray;
1578   PetscScalar                           *xarray;
1579   thrust::device_ptr<const PetscScalar> bGPU;
1580   thrust::device_ptr<PetscScalar>       xGPU;
1581   cusparseStatus_t                      stat;
1582   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1583   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1584   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1585   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1586   PetscErrorCode                        ierr;
1587   cudaError_t                           cerr;
1588 
1589   PetscFunctionBegin;
1590 
1591   /* Get the GPU pointers */
1592   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1593   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1594   xGPU = thrust::device_pointer_cast(xarray);
1595   bGPU = thrust::device_pointer_cast(barray);
1596 
1597   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1598   /* First, reorder with the row permutation */
1599   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1600                thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
1601                tempGPU->begin());
1602 
1603   /* Next, solve L */
1604   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1605                         loTriFactor->csrMat->num_rows,
1606                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1607                         loTriFactor->csrMat->num_entries,
1608                       #endif
1609                         &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1610                         loTriFactor->csrMat->values->data().get(),
1611                         loTriFactor->csrMat->row_offsets->data().get(),
1612                         loTriFactor->csrMat->column_indices->data().get(),
1613                         loTriFactor->solveInfo,
1614                         tempGPU->data().get(), xarray
1615                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1616                         ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
1617                       #endif
1618 );CHKERRCUSPARSE(stat);
1619 
1620   /* Then, solve U */
1621   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1622                         upTriFactor->csrMat->num_rows,
1623                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1624                         upTriFactor->csrMat->num_entries,
1625                       #endif
1626                         &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1627                         upTriFactor->csrMat->values->data().get(),
1628                         upTriFactor->csrMat->row_offsets->data().get(),
1629                         upTriFactor->csrMat->column_indices->data().get(),
1630                         upTriFactor->solveInfo,
1631                         xarray, tempGPU->data().get()
1632                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1633                         ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
1634                       #endif
1635 );CHKERRCUSPARSE(stat);
1636 
1637   /* Last, reorder with the column permutation */
1638   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
1639                thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
1640                xGPU);
1641 
1642   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1643   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
1644   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1645   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1646   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1647   PetscFunctionReturn(0);
1648 }
1649 
1650 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1651 {
1652   const PetscScalar                 *barray;
1653   PetscScalar                       *xarray;
1654   cusparseStatus_t                  stat;
1655   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1656   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1657   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1658   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1659   PetscErrorCode                    ierr;
1660   cudaError_t                       cerr;
1661 
1662   PetscFunctionBegin;
1663   /* Get the GPU pointers */
1664   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1665   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1666 
1667   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1668   /* First, solve L */
1669   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1670                         loTriFactor->csrMat->num_rows,
1671                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1672                         loTriFactor->csrMat->num_entries,
1673                       #endif
1674                         &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1675                         loTriFactor->csrMat->values->data().get(),
1676                         loTriFactor->csrMat->row_offsets->data().get(),
1677                         loTriFactor->csrMat->column_indices->data().get(),
1678                         loTriFactor->solveInfo,
1679                         barray, tempGPU->data().get()
1680                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1681                         ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
1682                       #endif
1683 );CHKERRCUSPARSE(stat);
1684 
1685   /* Next, solve U */
1686   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1687                         upTriFactor->csrMat->num_rows,
1688                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1689                         upTriFactor->csrMat->num_entries,
1690                       #endif
1691                         &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1692                         upTriFactor->csrMat->values->data().get(),
1693                         upTriFactor->csrMat->row_offsets->data().get(),
1694                         upTriFactor->csrMat->column_indices->data().get(),
1695                         upTriFactor->solveInfo,
1696                         tempGPU->data().get(), xarray
1697                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1698                         ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
1699                       #endif
1700 );CHKERRCUSPARSE(stat);
1701 
1702   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1703   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
1704   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1705   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1706   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1707   PetscFunctionReturn(0);
1708 }
1709 
1710 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
1711 {
1712   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
1713   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
1714   cudaError_t        cerr;
1715   PetscErrorCode     ierr;
1716 
1717   PetscFunctionBegin;
1718   if (A->offloadmask == PETSC_OFFLOAD_GPU) {
1719     CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat;
1720 
1721     ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
1722     cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
1723     cerr = WaitForCUDA();CHKERRCUDA(cerr);
1724     ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr);
1725     ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
1726     A->offloadmask = PETSC_OFFLOAD_BOTH;
1727   }
1728   PetscFunctionReturn(0);
1729 }
1730 
1731 static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
1732 {
1733   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
1734   PetscErrorCode ierr;
1735 
1736   PetscFunctionBegin;
1737   ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
1738   *array = a->a;
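       /* the caller gets read/write access to the host values, so the device copy must be considered stale */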
1739   A->offloadmask = PETSC_OFFLOAD_CPU;
1740   PetscFunctionReturn(0);
1741 }
1742 
1743 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1744 {
1745   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1746   Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
1747   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1748   PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
1749   PetscErrorCode               ierr;
1750   cusparseStatus_t             stat;
1751   PetscBool                    both = PETSC_TRUE;
1752   cudaError_t                  err;
1753 
1754   PetscFunctionBegin;
1755   if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Cannot copy to GPU");
1756   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
1757     if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
1758       CsrMatrix *matrix;
1759       matrix = (CsrMatrix*)cusparsestruct->mat->mat;
1760 
1761       if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR values");
1762       ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1763       matrix->values->assign(a->a, a->a+a->nz);
1764       err  = WaitForCUDA();CHKERRCUDA(err);
1765       ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
1766       ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1767       ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
1768     } else {
1769       PetscInt nnz;
1770       ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1771       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
1772       ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
1773       delete cusparsestruct->workVector;
1774       delete cusparsestruct->rowoffsets_gpu;
1775       cusparsestruct->workVector = NULL;
1776       cusparsestruct->rowoffsets_gpu = NULL;
1777       try {
1778         if (a->compressedrow.use) {
1779           m    = a->compressedrow.nrows;
1780           ii   = a->compressedrow.i;
1781           ridx = a->compressedrow.rindex;
1782         } else {
1783           m    = A->rmap->n;
1784           ii   = a->i;
1785           ridx = NULL;
1786         }
1787         if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR row data");
1788         if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR column data");
1789         if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
1790         else nnz = a->nz;
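             /* when the host values are not allocated yet (structure-only assembly) copy just the pattern;
                both == PETSC_FALSE keeps the offload mask from being set to PETSC_OFFLOAD_BOTH below */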
1791 
1792         /* create cusparse matrix */
1793         cusparsestruct->nrows = m;
1794         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1795         stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
1796         stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
1797         stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
1798 
1799         err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
1800         err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1801         err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1802         err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1803         err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1804         err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
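             /* alpha_one/beta_zero/beta_one live in device memory, hence the DEVICE pointer mode below */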
1805         stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
1806 
1807         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1808         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1809           /* set the matrix */
1810           CsrMatrix *mat= new CsrMatrix;
1811           mat->num_rows = m;
1812           mat->num_cols = A->cmap->n;
1813           mat->num_entries = nnz;
1814           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1815           mat->row_offsets->assign(ii, ii + m+1);
1816 
1817           mat->column_indices = new THRUSTINTARRAY32(nnz);
1818           mat->column_indices->assign(a->j, a->j+nnz);
1819 
1820           mat->values = new THRUSTARRAY(nnz);
1821           if (a->a) mat->values->assign(a->a, a->a+nnz);
1822 
1823           /* assign the pointer */
1824           matstruct->mat = mat;
1825          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1826           if (mat->num_rows) { /* cusparse errors on empty matrices! */
1827             stat = cusparseCreateCsr(&matstruct->matDescr,
1828                                     mat->num_rows, mat->num_cols, mat->num_entries,
1829                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
1830                                     mat->values->data().get(),
1831                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
1832                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
1833           }
1834          #endif
1835         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1836          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1837           SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1838          #else
1839           CsrMatrix *mat= new CsrMatrix;
1840           mat->num_rows = m;
1841           mat->num_cols = A->cmap->n;
1842           mat->num_entries = nnz;
1843           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1844           mat->row_offsets->assign(ii, ii + m+1);
1845 
1846           mat->column_indices = new THRUSTINTARRAY32(nnz);
1847           mat->column_indices->assign(a->j, a->j+nnz);
1848 
1849           mat->values = new THRUSTARRAY(nnz);
1850           if (a->a) mat->values->assign(a->a, a->a+nnz);
1851 
1852           cusparseHybMat_t hybMat;
1853           stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1854           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1855             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1856           stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
1857               matstruct->descr, mat->values->data().get(),
1858               mat->row_offsets->data().get(),
1859               mat->column_indices->data().get(),
1860               hybMat, 0, partition);CHKERRCUSPARSE(stat);
1861           /* assign the pointer */
1862           matstruct->mat = hybMat;
1863 
1864           if (mat) {
1865             if (mat->values) delete (THRUSTARRAY*)mat->values;
1866             if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
1867             if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
1868             delete (CsrMatrix*)mat;
1869           }
1870          #endif
1871         }
1872 
1873         /* assign the compressed row indices */
1874         if (a->compressedrow.use) {
1875           cusparsestruct->workVector = new THRUSTARRAY(m);
1876           matstruct->cprowIndices    = new THRUSTINTARRAY(m);
1877           matstruct->cprowIndices->assign(ridx,ridx+m);
1878           tmp = m;
1879         } else {
1880           cusparsestruct->workVector = NULL;
1881           matstruct->cprowIndices    = NULL;
1882           tmp = 0;
1883         }
1884         ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);
1885 
1886         /* assign the pointer */
1887         cusparsestruct->mat = matstruct;
1888       } catch(char *ex) {
1889         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1890       }
1891       err  = WaitForCUDA();CHKERRCUDA(err);
1892       ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1893       cusparsestruct->nonzerostate = A->nonzerostate;
1894     }
1895     if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
1896   }
1897   PetscFunctionReturn(0);
1898 }
1899 
1900 struct VecCUDAPlusEquals
1901 {
1902   template <typename Tuple>
1903   __host__ __device__
1904   void operator()(Tuple t)
1905   {
1906     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
1907   }
1908 };
1909 
1910 struct VecCUDAEquals
1911 {
1912   template <typename Tuple>
1913   __host__ __device__
1914   void operator()(Tuple t)
1915   {
1916     thrust::get<1>(t) = thrust::get<0>(t);
1917   }
1918 };
1919 
1920 struct VecCUDAEqualsReverse
1921 {
1922   template <typename Tuple>
1923   __host__ __device__
1924   void operator()(Tuple t)
1925   {
1926     thrust::get<0>(t) = thrust::get<1>(t);
1927   }
1928 };
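     /* elementwise tuple functors (y += x, y = x, and the reverse assignment) intended for thrust::for_each over zip iterators */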
1929 
1930 struct MatMatCusparse {
1931   PetscBool             cisdense;
1932   PetscScalar           *Bt;
1933   Mat                   X;
1934   PetscBool             reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
1935   PetscLogDouble        flops;
1936   CsrMatrix             *Bcsr;
1937 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1938   cusparseSpMatDescr_t  matSpBDescr;
1939   PetscBool             initialized;   /* C = alpha op(A) op(B) + beta C */
1940   cusparseDnMatDescr_t  matBDescr;
1941   cusparseDnMatDescr_t  matCDescr;
1942   PetscInt              Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
1943   size_t                mmBufferSize;
1944   void                  *mmBuffer;
1945   void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
1946   cusparseSpGEMMDescr_t spgemmDesc;
1947 #endif
1948 };
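     /* per-product work data stored in C->product->data; released by MatDestroy_MatMatCusparse() below */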
1949 
1950 static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
1951 {
1952   PetscErrorCode   ierr;
1953   MatMatCusparse   *mmdata = (MatMatCusparse *)data;
1954   cudaError_t      cerr;
1955  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1956   cusparseStatus_t stat;
1957  #endif
1958 
1959   PetscFunctionBegin;
1960   cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr);
1961   delete mmdata->Bcsr;
1962  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1963   if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); }
1964   if (mmdata->mmBuffer)    { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); }
1965   if (mmdata->mmBuffer2)   { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); }
1966   if (mmdata->matBDescr)   { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); }
1967   if (mmdata->matCDescr)   { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); }
1968   if (mmdata->spgemmDesc)  { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); }
1969  #endif
1970   ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr);
1971   ierr = PetscFree(data);CHKERRQ(ierr);
1972   PetscFunctionReturn(0);
1973 }
1974 
1975 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1976 
1977 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
1978 {
1979   Mat_Product                  *product = C->product;
1980   Mat                          A,B;
1981   PetscInt                     m,n,blda,clda;
1982   PetscBool                    flg,biscuda;
1983   Mat_SeqAIJCUSPARSE           *cusp;
1984   cusparseStatus_t             stat;
1985   cusparseOperation_t          opA;
1986   const PetscScalar            *barray;
1987   PetscScalar                  *carray;
1988   PetscErrorCode               ierr;
1989   MatMatCusparse               *mmdata;
1990   Mat_SeqAIJCUSPARSEMultStruct *mat;
1991   CsrMatrix                    *csrmat;
1992   cudaError_t                  cerr;
1993 
1994   PetscFunctionBegin;
1995   MatCheckProduct(C,1);
1996   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
1997   mmdata = (MatMatCusparse*)product->data;
1998   A    = product->A;
1999   B    = product->B;
2000   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2001   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
2002   /* currently CopyToGpu does not copy if the matrix is bound to CPU
2003      Instead of silently accepting the wrong answer, I prefer to raise the error */
2004   if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2005   ierr   = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2006   cusp   = (Mat_SeqAIJCUSPARSE*)A->spptr;
2007   switch (product->type) {
2008   case MATPRODUCT_AB:
2009   case MATPRODUCT_PtAP:
2010     mat = cusp->mat;
2011     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2012     m   = A->rmap->n;
2013     n   = B->cmap->n;
2014     break;
2015   case MATPRODUCT_AtB:
2016     if (!A->form_explicit_transpose) {
2017       mat = cusp->mat;
2018       opA = CUSPARSE_OPERATION_TRANSPOSE;
2019     } else {
2020       ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
2021       mat  = cusp->matTranspose;
2022       opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
2023     }
2024     m = A->cmap->n;
2025     n = B->cmap->n;
2026     break;
2027   case MATPRODUCT_ABt:
2028   case MATPRODUCT_RARt:
2029     mat = cusp->mat;
2030     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2031     m   = A->rmap->n;
2032     n   = B->rmap->n;
2033     break;
2034   default:
2035     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2036   }
2037   if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
2038   csrmat = (CsrMatrix*)mat->mat;
2039   /* if the user passed a CPU matrix, copy the data to the GPU */
2040   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
2041   if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
2042   ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);
2043 
2044   ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
2045   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2046     ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
2047     ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
2048   } else {
2049     ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
2050     ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
2051   }
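       /* for MATPRODUCT_PtAP and MATPRODUCT_RARt the sparse-times-dense result is written to the intermediate dense matrix mmdata->X rather than directly to C */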
2052 
2053   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2054  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2055   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2056   /* (re)allocate mmBuffer if not initialized or the LDAs have changed */
2057   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2058     size_t mmBufferSize;
2059     if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
2060     if (!mmdata->matBDescr) {
2061       stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
2062       mmdata->Blda = blda;
2063     }
2064 
2065     if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
2066     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2067       stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
2068       mmdata->Clda = clda;
2069     }
2070 
2071     if (!mat->matDescr) {
2072       stat = cusparseCreateCsr(&mat->matDescr,
2073                                csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
2074                                csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
2075                                csrmat->values->data().get(),
2076                                CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2077                                CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
2078     }
2079     stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
2080                                    mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2081                                    mmdata->matCDescr,cusparse_scalartype,
2082                                    cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
2083     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2084       cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
2085       cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
2086       mmdata->mmBufferSize = mmBufferSize;
2087     }
2088     mmdata->initialized = PETSC_TRUE;
2089   } else {
2090     /* to be safe, always update pointers of the mats */
2091     stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
2092     stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
2093     stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
2094   }
2095 
2096   /* do cusparseSpMM, which supports transpose on B */
2097   stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
2098                       mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2099                       mmdata->matCDescr,cusparse_scalartype,
2100                       cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2101  #else
2102   PetscInt k;
2103   /* cusparseXcsrmm does not support transpose on B */
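  /* so for ABt and RARt we first form B^T explicitly into mmdata->Bt with an out-of-place cuBLAS geam transpose */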
2104   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2105     cublasHandle_t cublasv2handle;
2106     cublasStatus_t cerr;
2107 
2108     ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
2109     cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
2110                        B->cmap->n,B->rmap->n,
2111                        &PETSC_CUSPARSE_ONE ,barray,blda,
2112                        &PETSC_CUSPARSE_ZERO,barray,blda,
2113                        mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
2114     blda = B->cmap->n;
2115     k    = B->cmap->n;
2116   } else {
2117     k    = B->rmap->n;
2118   }
2119 
2120   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2121   stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
2122                            csrmat->num_entries,mat->alpha_one,mat->descr,
2123                            csrmat->values->data().get(),
2124                            csrmat->row_offsets->data().get(),
2125                            csrmat->column_indices->data().get(),
2126                            mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
2127                            carray,clda);CHKERRCUSPARSE(stat);
2128  #endif
2129   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2130   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2131   ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
2132   ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
2133   if (product->type == MATPRODUCT_RARt) {
2134     ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
2135     ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
2136   } else if (product->type == MATPRODUCT_PtAP) {
2137     ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
2138     ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
2139   } else {
2140     ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
2141   }
2142   if (mmdata->cisdense) {
2143     ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
2144   }
2145   if (!biscuda) {
2146     ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
2147   }
2148   PetscFunctionReturn(0);
2149 }
2150 
2151 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2152 {
2153   Mat_Product        *product = C->product;
2154   Mat                A,B;
2155   PetscInt           m,n;
2156   PetscBool          cisdense,flg;
2157   PetscErrorCode     ierr;
2158   MatMatCusparse     *mmdata;
2159   Mat_SeqAIJCUSPARSE *cusp;
2160 
2161   PetscFunctionBegin;
2162   MatCheckProduct(C,1);
2163   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
2164   A    = product->A;
2165   B    = product->B;
2166   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2167   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
2168   cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2169   if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
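  /* Result sizes for the supported product types (B plays the role of P in PtAP and of R in RARt):
       AB:   C = A*B     is A->rmap->n x B->cmap->n
       AtB:  C = A^T*B   is A->cmap->n x B->cmap->n
       ABt:  C = A*B^T   is A->rmap->n x B->rmap->n
       PtAP: C = P^T*A*P is B->cmap->n x B->cmap->n
       RARt: C = R*A*R^T is B->rmap->n x B->rmap->n */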
2170   switch (product->type) {
2171   case MATPRODUCT_AB:
2172     m = A->rmap->n;
2173     n = B->cmap->n;
2174     break;
2175   case MATPRODUCT_AtB:
2176     m = A->cmap->n;
2177     n = B->cmap->n;
2178     break;
2179   case MATPRODUCT_ABt:
2180     m = A->rmap->n;
2181     n = B->rmap->n;
2182     break;
2183   case MATPRODUCT_PtAP:
2184     m = B->cmap->n;
2185     n = B->cmap->n;
2186     break;
2187   case MATPRODUCT_RARt:
2188     m = B->rmap->n;
2189     n = B->rmap->n;
2190     break;
2191   default:
2192     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2193   }
2194   ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
2195   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy the result back to the CPU */
2196   ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
2197   ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);
2198 
2199   /* product data */
2200   ierr = PetscNew(&mmdata);CHKERRQ(ierr);
2201   mmdata->cisdense = cisdense;
2202  #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
2203   /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
2204   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2205     cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
2206   }
2207  #endif
2208   /* for these products we need intermediate storage */
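  /* mmdata->X holds the sparse-times-dense intermediate: X = A*P for PtAP and X = A*R^T for RARt;
     the numeric phase then computes C = P^T*X or C = R*X with a dense matrix-matrix product */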
2209   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2210     ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
2211     ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
2212     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
2213       ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
2214     } else {
2215       ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
2216     }
2217   }
2218   C->product->data    = mmdata;
2219   C->product->destroy = MatDestroy_MatMatCusparse;
2220 
2221   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
2222   PetscFunctionReturn(0);
2223 }
2224 
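/* For reference, a minimal (uncompiled) sketch of how user code typically reaches the MatProduct
   kernels in this file, assuming A is MATSEQAIJCUSPARSE and B is a compatible matrix type:

     Mat C;
     ierr = MatProductCreate(A,B,NULL,&C);CHKERRQ(ierr);
     ierr = MatProductSetType(C,MATPRODUCT_AB);CHKERRQ(ierr);
     ierr = MatProductSetFromOptions(C);CHKERRQ(ierr);  // dispatches to the symbolic routines in this file
     ierr = MatProductSymbolic(C);CHKERRQ(ierr);
     ierr = MatProductNumeric(C);CHKERRQ(ierr);

   Convenience interfaces such as MatMatMult() follow the same path internally. */
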
2225 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2226 {
2227   Mat_Product                  *product = C->product;
2228   Mat                          A,B;
2229   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2230   Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
2231   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2232   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2233   PetscBool                    flg;
2234   PetscErrorCode               ierr;
2235   cusparseStatus_t             stat;
2236   cudaError_t                  cerr;
2237   MatProductType               ptype;
2238   MatMatCusparse               *mmdata;
2239 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2240   cusparseSpMatDescr_t         BmatSpDescr;
2241 #endif
2242 
2243   PetscFunctionBegin;
2244   MatCheckProduct(C,1);
2245   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
2246   ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2247   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for C of type %s",((PetscObject)C)->type_name);
2248   mmdata = (MatMatCusparse*)C->product->data;
2249   A = product->A;
2250   B = product->B;
2251   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have already been computed in the MatProductSymbolic phase */
2252     mmdata->reusesym = PETSC_FALSE;
2253     Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2254     if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2255     Cmat = Ccusp->mat;
2256     if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
2257     Ccsr = (CsrMatrix*)Cmat->mat;
2258     if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
2259     goto finalize;
2260   }
2261   if (!c->nz) goto finalize;
2262   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2263   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
2264   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2265   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
2266   if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2267   if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2268   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2269   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2270   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2271   if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2272   if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2273   if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2274   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2275   ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
2276 
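  /* for symmetric operands we can avoid the explicit transpose: A^T*B equals A*B when A is symmetric, and A*B^T equals A*B when B is symmetric */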
2277   ptype = product->type;
2278   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
2279   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
2280   switch (ptype) {
2281   case MATPRODUCT_AB:
2282     Amat = Acusp->mat;
2283     Bmat = Bcusp->mat;
2284     break;
2285   case MATPRODUCT_AtB:
2286     Amat = Acusp->matTranspose;
2287     Bmat = Bcusp->mat;
2288     break;
2289   case MATPRODUCT_ABt:
2290     Amat = Acusp->mat;
2291     Bmat = Bcusp->matTranspose;
2292     break;
2293   default:
2294     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2295   }
2296   Cmat = Ccusp->mat;
2297   if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2298   if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2299   if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
2300   Acsr = (CsrMatrix*)Amat->mat;
2301   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
2302   Ccsr = (CsrMatrix*)Cmat->mat;
2303   if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
2304   if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
2305   if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
2306   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2307 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2308   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
2309   stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2310                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2311                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2312                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2313   stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2314                              Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2315                              cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2316 #else
2317   stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2318                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2319                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2320                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2321                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2322 #endif
2323   ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
2324   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2325   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2326   C->offloadmask = PETSC_OFFLOAD_GPU;
2327 finalize:
2328   /* shorter version of MatAssemblyEnd_SeqAIJ */
2329   ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
2330   ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
2331   ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
2332   c->reallocs         = 0;
2333   C->info.mallocs    += 0;
2334   C->info.nz_unneeded = 0;
2335   C->assembled = C->was_assembled = PETSC_TRUE;
2336   C->num_ass++;
2337   PetscFunctionReturn(0);
2338 }
2339 
2340 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2341 {
2342   Mat_Product                  *product = C->product;
2343   Mat                          A,B;
2344   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2345   Mat_SeqAIJ                   *a,*b,*c;
2346   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2347   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2348   PetscInt                     i,j,m,n,k;
2349   PetscBool                    flg;
2350   PetscErrorCode               ierr;
2351   cusparseStatus_t             stat;
2352   cudaError_t                  cerr;
2353   MatProductType               ptype;
2354   MatMatCusparse               *mmdata;
2355   PetscLogDouble               flops;
2356   PetscBool                    biscompressed,ciscompressed;
2357 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2358   int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
2359   size_t                       bufSize2;
2360   cusparseSpMatDescr_t         BmatSpDescr;
2361 #else
2362   int                          cnz;
2363 #endif
2364 
2365   PetscFunctionBegin;
2366   MatCheckProduct(C,1);
2367   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
2368   A    = product->A;
2369   B    = product->B;
2370   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2371   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
2372   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2373   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
2374   a = (Mat_SeqAIJ*)A->data;
2375   b = (Mat_SeqAIJ*)B->data;
2376   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2377   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2378   if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2379   if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2380 
2381   /* product data */
2382   ierr = PetscNew(&mmdata);CHKERRQ(ierr);
2383   C->product->data    = mmdata;
2384   C->product->destroy = MatDestroy_MatMatCusparse;
2385 
2386   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2387   ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
2388   ptype = product->type;
2389   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
2390   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
2391   biscompressed = PETSC_FALSE;
2392   ciscompressed = PETSC_FALSE;
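  /* select the operands (the stored explicit transpose is used for AtB and ABt) and record whether B must be
     expanded from compressed row storage (biscompressed) and whether C inherits A's compressed row layout (ciscompressed) */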
2393   switch (ptype) {
2394   case MATPRODUCT_AB:
2395     m = A->rmap->n;
2396     n = B->cmap->n;
2397     k = A->cmap->n;
2398     Amat = Acusp->mat;
2399     Bmat = Bcusp->mat;
2400     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2401     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2402     break;
2403   case MATPRODUCT_AtB:
2404     m = A->cmap->n;
2405     n = B->cmap->n;
2406     k = A->rmap->n;
2407     ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
2408     Amat = Acusp->matTranspose;
2409     Bmat = Bcusp->mat;
2410     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2411     break;
2412   case MATPRODUCT_ABt:
2413     m = A->rmap->n;
2414     n = B->rmap->n;
2415     k = A->cmap->n;
2416     ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr);
2417     Amat = Acusp->mat;
2418     Bmat = Bcusp->matTranspose;
2419     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2420     break;
2421   default:
2422     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2423   }
2424 
2425   /* create cusparse matrix */
2426   ierr  = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
2427   ierr  = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
2428   c     = (Mat_SeqAIJ*)C->data;
2429   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2430   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
2431   Ccsr  = new CsrMatrix;
2432 
2433   c->compressedrow.use = ciscompressed;
2434   if (c->compressedrow.use) { /* if a is in compressed row format, then c will be in compressed row format too */
2435     c->compressedrow.nrows = a->compressedrow.nrows;
2436     ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
2437     ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
2438     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
2439     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2440     Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
2441   } else {
2442     c->compressedrow.nrows  = 0;
2443     c->compressedrow.i      = NULL;
2444     c->compressedrow.rindex = NULL;
2445     Ccusp->workVector       = NULL;
2446     Cmat->cprowIndices      = NULL;
2447   }
2448   Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
2449   Ccusp->mat      = Cmat;
2450   Ccusp->mat->mat = Ccsr;
2451   Ccsr->num_rows    = Ccusp->nrows;
2452   Ccsr->num_cols    = n;
2453   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
2454   stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
2455   stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
2456   stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
2457   cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
2458   cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
2459   cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
2460   cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2461   cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2462   cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2463   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raises errors in different calls when matrices have zero rows/columns! */
2464     thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
2465     c->nz = 0;
2466     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2467     Ccsr->values = new THRUSTARRAY(c->nz);
2468     goto finalizesym;
2469   }
2470 
2471   if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2472   if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2473   Acsr = (CsrMatrix*)Amat->mat;
2474   if (!biscompressed) {
2475     Bcsr = (CsrMatrix*)Bmat->mat;
2476 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2477     BmatSpDescr = Bmat->matDescr;
2478 #endif
2479   } else { /* we need to use row offsets for the full matrix */
2480     CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
2481     Bcsr = new CsrMatrix;
2482     Bcsr->num_rows       = B->rmap->n;
2483     Bcsr->num_cols       = cBcsr->num_cols;
2484     Bcsr->num_entries    = cBcsr->num_entries;
2485     Bcsr->column_indices = cBcsr->column_indices;
2486     Bcsr->values         = cBcsr->values;
2487     if (!Bcusp->rowoffsets_gpu) {
2488       Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
2489       Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
2490       ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
2491     }
2492     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2493     mmdata->Bcsr = Bcsr;
2494 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2495     if (Bcsr->num_rows && Bcsr->num_cols) {
2496       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
2497                                Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2498                                Bcsr->values->data().get(),
2499                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2500                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2501     }
2502     BmatSpDescr = mmdata->matSpBDescr;
2503 #endif
2504   }
2505   if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
2506   if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
2507   /* precompute flops count */
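  /* for AB, each nonzero a(i,j) multiplies row j of B, costing 2*nnz(B(j,:)) flops;
     for AtB, row i of A is combined with row i of B, costing 2*nnz(A(i,:))*nnz(B(i,:)) flops */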
2508   if (ptype == MATPRODUCT_AB) {
2509     for (i=0, flops = 0; i<A->rmap->n; i++) {
2510       const PetscInt st = a->i[i];
2511       const PetscInt en = a->i[i+1];
2512       for (j=st; j<en; j++) {
2513         const PetscInt brow = a->j[j];
2514         flops += 2.*(b->i[brow+1] - b->i[brow]);
2515       }
2516     }
2517   } else if (ptype == MATPRODUCT_AtB) {
2518     for (i=0, flops = 0; i<A->rmap->n; i++) {
2519       const PetscInt anzi = a->i[i+1] - a->i[i];
2520       const PetscInt bnzi = b->i[i+1] - b->i[i];
2521       flops += (2.*anzi)*bnzi;
2522     }
2523   } else { /* TODO */
2524     flops = 0.;
2525   }
2526 
2527   mmdata->flops = flops;
2528   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2529 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2530   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2531   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
2532                            NULL, NULL, NULL,
2533                            CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2534                            CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2535   stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2536   /* query how many bytes of external buffer the work-estimation step needs */
2537   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2538                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2539                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2540                                        mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
2541   cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
2542   /* inspect the matrices A and B to understand the memory requirement for the next step */
2543   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2544                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2545                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2546                                        mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
2547   /* query how many bytes of external buffer the compute step needs */
2548   stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2549                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2550                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2551                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
2552   /* Neither the CUSPARSE documentation nor the API is clear here:
2553      we need both buffers to perform the operations properly!
2554      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API;
2555      it only appears in the workEstimation calls, yet it seems to be needed in compute, so its address
2556      is probably stored in the descriptor! What a messy API... */
2557   cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
2558   /* compute the intermediate product of A * B */
2559   stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2560                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2561                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2562                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2563   /* get matrix C non-zero entries C_nnz1 */
2564   stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
2565   c->nz = (PetscInt) C_nnz1;
2566   ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr);
2567   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2568   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2569   Ccsr->values = new THRUSTARRAY(c->nz);
2570   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2571   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
2572                                 Ccsr->values->data().get());CHKERRCUSPARSE(stat);
2573   stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2574                              Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2575                              cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2576 #else
2577   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
2578   stat = cusparseXcsrgemmNnz(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2579                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2580                              Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2581                              Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2582                              Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
2583   c->nz = cnz;
2584   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2585   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2586   Ccsr->values = new THRUSTARRAY(c->nz);
2587   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2588 
2589   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2590   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
2591      I have tried the gemm2 interface (alpha * A * B + beta * D), which allows one to do the symbolic phase by passing NULL for the values, but it seems quite buggy when
2592      D is NULL, despite the fact that the CUSPARSE documentation claims it is supported! */
2593   stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2594                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2595                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2596                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2597                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2598 #endif
2599   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2600   ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
2601   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2602 finalizesym:
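  /* build the host (Mat_SeqAIJ) index structure of C from the device CSR just computed: copy the row offsets
     and column indices back to the host (expanding compressed row offsets if needed) and fill ilen/imax/rmax
     so that C looks like a regularly preallocated SeqAIJ matrix */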
2603   c->singlemalloc = PETSC_FALSE;
2604   c->free_a       = PETSC_TRUE;
2605   c->free_ij      = PETSC_TRUE;
2606   ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
2607   ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
2608   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
2609     PetscInt *d_i = c->i;
2610     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
2611     THRUSTINTARRAY jj(Ccsr->column_indices->size());
2612     ii   = *Ccsr->row_offsets;
2613     jj   = *Ccsr->column_indices;
2614     if (ciscompressed) d_i = c->compressedrow.i;
2615     cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2616     cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2617   } else {
2618     PetscInt *d_i = c->i;
2619     if (ciscompressed) d_i = c->compressedrow.i;
2620     cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2621     cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2622   }
2623   if (ciscompressed) { /* need to expand host row offsets */
2624     PetscInt r = 0;
2625     c->i[0] = 0;
2626     for (k = 0; k < c->compressedrow.nrows; k++) {
2627       const PetscInt next = c->compressedrow.rindex[k];
2628       const PetscInt old = c->compressedrow.i[k];
2629       for (; r < next; r++) c->i[r+1] = old;
2630     }
2631     for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
2632   }
2633   ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
2634   ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
2635   ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
2636   c->maxnz = c->nz;
2637   c->nonzerorowcnt = 0;
2638   c->rmax = 0;
2639   for (k = 0; k < m; k++) {
2640     const PetscInt nn = c->i[k+1] - c->i[k];
2641     c->ilen[k] = c->imax[k] = nn;
2642     c->nonzerorowcnt += (PetscInt)!!nn;
2643     c->rmax = PetscMax(c->rmax,nn);
2644   }
2645   ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
2646   ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
2647   Ccsr->num_entries = c->nz;
2648 
2649   C->nonzerostate++;
2650   ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
2651   ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
2652   Ccusp->nonzerostate = C->nonzerostate;
2653   C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
2654   C->preallocated  = PETSC_TRUE;
2655   C->assembled     = PETSC_FALSE;
2656   C->was_assembled = PETSC_FALSE;
2657   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
2658     mmdata->reusesym = PETSC_TRUE;
2659     C->offloadmask   = PETSC_OFFLOAD_GPU;
2660   }
2661   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2662   PetscFunctionReturn(0);
2663 }
2664 
2665 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2666 
2667 /* handles sparse or dense B */
2668 static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
2669 {
2670   Mat_Product    *product = mat->product;
2671   PetscErrorCode ierr;
2672   PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;
2673 
2674   PetscFunctionBegin;
2675   MatCheckProduct(mat,1);
2676   ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr);
2677   if (!product->A->boundtocpu && !product->B->boundtocpu) {
2678     ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr);
2679   }
2680   if (product->type == MATPRODUCT_ABC) {
2681     Ciscusp = PETSC_FALSE;
2682     if (!product->C->boundtocpu) {
2683       ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr);
2684     }
2685   }
2686   if (isdense) {
2687     switch (product->type) {
2688     case MATPRODUCT_AB:
2689     case MATPRODUCT_AtB:
2690     case MATPRODUCT_ABt:
2691     case MATPRODUCT_PtAP:
2692     case MATPRODUCT_RARt:
2693      if (product->A->boundtocpu) {
2694         ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
2695       } else {
2696         mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
2697       }
2698       break;
2699     case MATPRODUCT_ABC:
2700       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2701       break;
2702     default:
2703       break;
2704     }
2705   } else if (Biscusp && Ciscusp) {
2706     switch (product->type) {
2707     case MATPRODUCT_AB:
2708     case MATPRODUCT_AtB:
2709     case MATPRODUCT_ABt:
2710       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2711       break;
2712     case MATPRODUCT_PtAP:
2713     case MATPRODUCT_RARt:
2714     case MATPRODUCT_ABC:
2715       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2716       break;
2717     default:
2718       break;
2719     }
2720   } else { /* fallback for AIJ */
2721     ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
2722   }
2723   PetscFunctionReturn(0);
2724 }
2725 
2726 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2727 {
2728   PetscErrorCode ierr;
2729 
2730   PetscFunctionBegin;
2731   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
2732   PetscFunctionReturn(0);
2733 }
2734 
2735 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
2736 {
2737   PetscErrorCode ierr;
2738 
2739   PetscFunctionBegin;
2740   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
2741   PetscFunctionReturn(0);
2742 }
2743 
2744 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2745 {
2746   PetscErrorCode ierr;
2747 
2748   PetscFunctionBegin;
2749   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
2750   PetscFunctionReturn(0);
2751 }
2752 
2753 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
2754 {
2755   PetscErrorCode ierr;
2756 
2757   PetscFunctionBegin;
2758   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
2759   PetscFunctionReturn(0);
2760 }
2761 
2762 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2763 {
2764   PetscErrorCode ierr;
2765 
2766   PetscFunctionBegin;
2767   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
2768   PetscFunctionReturn(0);
2769 }
2770 
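/* one thread per entry of the compressed work vector: adds x[i] into the full-length vector y at row idx[i] */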
2771 __global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
2772 {
2773   int i = blockIdx.x*blockDim.x + threadIdx.x;
2774   if (i < n) y[idx[i]] += x[i];
2775 }
2776 
2777 /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
2778 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
2779 {
2780   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
2781   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
2782   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
2783   PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
2784   PetscErrorCode               ierr;
2785   cudaError_t                  cerr;
2786   cusparseStatus_t             stat;
2787   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2788   PetscBool                    compressed;
2789 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2790   PetscInt                     nx,ny;
2791 #endif
2792 
2793   PetscFunctionBegin;
2794   if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
2795   if (!a->nonzerorowcnt) {
2796     if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
2797     else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
2798     PetscFunctionReturn(0);
2799   }
2800   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
2801   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2802   if (!trans) {
2803     matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2804     if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
2805   } else {
2806     if (herm || !A->form_explicit_transpose) {
2807       opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
2808       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2809     } else {
2810       if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);}
2811       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
2812     }
2813   }
2814   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
2815   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
2816 
2817   try {
2818     ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
2819     if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
2820     else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */
2821 
2822     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2823     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2824       /* z = A x + beta y.
2825          If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
2826          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
2827       */
2828       xptr = xarray;
2829       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
2830       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
2831      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2832       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
2833           allocated to accommodate different uses. So we get the length info directly from mat.
2834        */
2835       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2836         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2837         nx = mat->num_cols;
2838         ny = mat->num_rows;
2839       }
2840      #endif
2841     } else {
2842       /* z = A^T x + beta y
2843          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
2844          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
2845        */
2846       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
2847       dptr = zarray;
2848       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
2849       if (compressed) { /* Scatter x to work vector */
2850         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
2851         thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
2852                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2853                          VecCUDAEqualsReverse());
2854       }
2855      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2856       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2857         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2858         nx = mat->num_rows;
2859         ny = mat->num_cols;
2860       }
2861      #endif
2862     }
2863 
2864     /* csr_spmv does y = alpha op(A) x + beta y */
2865     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2866      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2867       if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
2868       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
2869         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2870         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2871         stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
2872                                 matstruct->matDescr,
2873                                 matstruct->cuSpMV[opA].vecXDescr, beta,
2874                                 matstruct->cuSpMV[opA].vecYDescr,
2875                                 cusparse_scalartype,
2876                                 cusparsestruct->spmvAlg,
2877                                 &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
2878         cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);
2879 
2880         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
2881       } else {
2882         /* x and y's value pointers might change between calls, but their shapes are unchanged, so we just update the pointers */
2883         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
2884         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
2885       }
2886 
2887       stat = cusparseSpMV(cusparsestruct->handle, opA,
2888                                matstruct->alpha_one,
2889                                matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTransposeForMult() */
2890                                matstruct->cuSpMV[opA].vecXDescr,
2891                                beta,
2892                                matstruct->cuSpMV[opA].vecYDescr,
2893                                cusparse_scalartype,
2894                                cusparsestruct->spmvAlg,
2895                                matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
2896      #else
2897       CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2898       stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
2899                                mat->num_rows, mat->num_cols,
2900                                mat->num_entries, matstruct->alpha_one, matstruct->descr,
2901                                mat->values->data().get(), mat->row_offsets->data().get(),
2902                                mat->column_indices->data().get(), xptr, beta,
2903                                dptr);CHKERRCUSPARSE(stat);
2904      #endif
2905     } else {
2906       if (cusparsestruct->nrows) {
2907        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2908         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2909        #else
2910         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
2911         stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
2912                                  matstruct->alpha_one, matstruct->descr, hybMat,
2913                                  xptr, beta,
2914                                  dptr);CHKERRCUSPARSE(stat);
2915        #endif
2916       }
2917     }
2918     cerr = WaitForCUDA();CHKERRCUDA(cerr);
2919     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2920 
2921     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2922       if (yy) { /* MatMultAdd: zz = A*xx + yy */
2923         if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
2924           ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
2925         } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
2926           ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
2927         }
2928       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
2929         ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
2930       }
2931 
2932       /* ScatterAdd the result from work vector into the full vector when A is compressed */
2933       if (compressed) {
2934         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2935         /* I wanted to make this for_each asynchronous, but failed. thrust::async::for_each() returns an event (internally registered),
2936            and when the event goes out of scope its destructor calls cudaStreamSynchronize() on the stream. One would have to store all
2937            the events to prevent that, so I just use a plain ScatterAdd kernel instead.
2938          */
2939        #if 0
2940         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
2941         thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
2942                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
2943                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2944                          VecCUDAPlusEquals());
2945        #else
2946         PetscInt n = matstruct->cprowIndices->size();
2947         ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
2948        #endif
2949         cerr = WaitForCUDA();CHKERRCUDA(cerr);
2950         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2951       }
2952     } else {
2953       if (yy && yy != zz) {
2954         ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
2955       }
2956     }
2957     ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
2958     if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
2959     else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
2960   } catch(char *ex) {
2961     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
2962   }
2963   if (yy) {
2964     ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
2965   } else {
2966     ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
2967   }
2968   PetscFunctionReturn(0);
2969 }
2970 
2971 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
2972 {
2973   PetscErrorCode ierr;
2974 
2975   PetscFunctionBegin;
2976   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
2977   PetscFunctionReturn(0);
2978 }
2979 
2980 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
2981 {
2982   PetscErrorCode              ierr;
2983   PetscSplitCSRDataStructure  *d_mat = NULL;
2984   PetscFunctionBegin;
2985   if (A->factortype == MAT_FACTOR_NONE) {
2986     d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
2987   }
2988   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); // this does very little if assembled on GPU - call it?
2989   if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0);
2990   if (d_mat) {
2991     A->offloadmask = PETSC_OFFLOAD_GPU;
2992   }
2993 
2994   PetscFunctionReturn(0);
2995 }
2996 
2997 /* --------------------------------------------------------------------------------*/
2998 /*@
2999    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
3000    (the default parallel PETSc format). This matrix will ultimately be pushed down
3001    to NVIDIA GPUs and will use the CUSPARSE library for calculations. For good matrix
3002    assembly performance the user should preallocate the matrix storage by setting
3003    the parameter nz (or the array nnz).  By setting these parameters accurately,
3004    performance during matrix assembly can be increased by more than a factor of 50.
3005 
3006    Collective
3007 
3008    Input Parameters:
3009 +  comm - MPI communicator, set to PETSC_COMM_SELF
3010 .  m - number of rows
3011 .  n - number of columns
3012 .  nz - number of nonzeros per row (same for all rows)
3013 -  nnz - array containing the number of nonzeros in the various rows
3014          (possibly different for each row) or NULL
3015 
3016    Output Parameter:
3017 .  A - the matrix
3018 
3019    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
3020    MatXXXXSetPreallocation() paradigm instead of this routine directly.
3021    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
3022 
3023    Notes:
3024    If nnz is given then nz is ignored
3025 
3026    The AIJ format (also called the Yale sparse matrix format or
3027    compressed row storage), is fully compatible with standard Fortran 77
3028    storage.  That is, the stored row and column indices can begin at
3029    either one (as in Fortran) or zero.  See the users' manual for details.
3030 
3031    Specify the preallocated storage with either nz or nnz (not both).
3032    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
3033    allocation.  For large problems you MUST preallocate memory or you
3034    will get TERRIBLE performance, see the users' manual chapter on matrices.
3035 
3036    By default, this format uses inodes (identical nodes) when possible, to
3037    improve numerical efficiency of matrix-vector products and solves. We
3038    search for consecutive rows with the same nonzero structure, thereby
3039    reusing matrix information to achieve increased efficiency.
3040 
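   Example usage (a minimal sketch; variable declarations and error checking omitted):
.vb
   Mat A;
   MatCreateSeqAIJCUSPARSE(PETSC_COMM_SELF,n,n,3,NULL,&A);
   MatSetValues(A,1,&row,1,&col,&val,INSERT_VALUES);
   MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
   MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
   MatMult(A,x,y);
   MatDestroy(&A);
.ve
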
3041    Level: intermediate
3042 
3043 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
3044 @*/
3045 PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
3046 {
3047   PetscErrorCode ierr;
3048 
3049   PetscFunctionBegin;
3050   ierr = MatCreate(comm,A);CHKERRQ(ierr);
3051   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
3052   ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
3053   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
3054   PetscFunctionReturn(0);
3055 }
3056 
3057 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
3058 {
3059   PetscErrorCode              ierr;
3060   PetscSplitCSRDataStructure  *d_mat = NULL;
3061 
3062   PetscFunctionBegin;
3063   if (A->factortype == MAT_FACTOR_NONE) {
3064     d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
3065     ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL;
3066     ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
3067   } else {
3068     ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
3069   }
3070   if (d_mat) {
3071     Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;
3072     cudaError_t                err;
3073     PetscSplitCSRDataStructure h_mat;
3074     ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
3075     err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
3076     if (a->compressedrow.use) {
3077       err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
3078     }
3079     err = cudaFree(d_mat);CHKERRCUDA(err);
3080   }
3081   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
3082   ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
3083   ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
3084   ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
3085   ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
3086   ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
3087   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
3088   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
3089   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
3090   PetscFunctionReturn(0);
3091 }
3092 
3093 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
3094 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
3095 static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
3096 {
3097   PetscErrorCode ierr;
3098 
3099   PetscFunctionBegin;
3100   ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
3101   ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
3102   PetscFunctionReturn(0);
3103 }
3104 
3105 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
3106 {
3107   PetscErrorCode     ierr;
3108   Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
3109   Mat_SeqAIJCUSPARSE *cy;
3110   Mat_SeqAIJCUSPARSE *cx;
3111   PetscScalar        *ay;
3112   const PetscScalar  *ax;
3113   CsrMatrix          *csry,*csrx;
3114   cudaError_t        cerr;
3115 
3116   PetscFunctionBegin;
3117   cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
3118   cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
3119   if (X->ops->axpy != Y->ops->axpy) {
3120     ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
3121     ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
3122     PetscFunctionReturn(0);
3123   }
3124   /* if we are here, it means both matrices are bound to GPU */
3125   ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
3126   ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
3127   if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
3128   if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
3129   csry = (CsrMatrix*)cy->mat->mat;
3130   csrx = (CsrMatrix*)cx->mat->mat;
3131   /* see if we can turn this into a cublas axpy */
3132   if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
3133     bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
3134     if (eq) {
3135       eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
3136     }
3137     if (eq) str = SAME_NONZERO_PATTERN;
3138   }
3139   /* spgeam is buggy with one column */
3140   if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;
3141 
3142   if (str == SUBSET_NONZERO_PATTERN) {
3143     cusparseStatus_t stat;
3144     PetscScalar      b = 1.0;
3145 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3146     size_t           bufferSize;
3147     void             *buffer;
3148 #endif
3149 
3150     ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
3151     ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
3152     stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
3153 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3154     stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
3155                                           &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
3156                                           &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
3157                                              cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
3158     cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
3159     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3160     stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
3161                                &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
3162                                &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
3163                                   cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
3164     cerr = WaitForCUDA();CHKERRCUDA(cerr);
3165     ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
3166     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
3167     cerr = cudaFree(buffer);CHKERRCUDA(cerr);
3168 #else
3169     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3170     stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
3171                                &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
3172                                &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
3173                                   cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat);
3174     cerr = WaitForCUDA();CHKERRCUDA(cerr);
3175     ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
3176     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
3177 #endif
3178     stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
3179     ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
3180     ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
3181     ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
3182   } else if (str == SAME_NONZERO_PATTERN) {
3183     cublasHandle_t cublasv2handle;
3184     cublasStatus_t berr;
3185     PetscBLASInt   one = 1, bnz = 1;
3186 
3187     ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
3188     ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
3189     ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
3190     ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr);
3191     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3192     berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr);
3193     cerr = WaitForCUDA();CHKERRCUDA(cerr);
3194     ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
3195     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
3196     ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
3197     ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
3198     ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
3199   } else {
3200     ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
3201     ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
3202   }
3203   PetscFunctionReturn(0);
3204 }
3205 
3206 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
3207 {
3208   PetscErrorCode ierr;
3209   Mat_SeqAIJ     *y = (Mat_SeqAIJ*)Y->data;
3210   PetscScalar    *ay;
3211   cudaError_t    cerr;
3212   cublasHandle_t cublasv2handle;
3213   cublasStatus_t berr;
3214   PetscBLASInt   one = 1, bnz = 1;
3215 
3216   PetscFunctionBegin;
3217   ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
3218   ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
3219   ierr = PetscBLASIntCast(y->nz,&bnz);CHKERRQ(ierr);
3220   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3221   berr = cublasXscal(cublasv2handle,bnz,&a,ay,one);CHKERRCUBLAS(berr);
3222   cerr = WaitForCUDA();CHKERRCUDA(cerr);
3223   ierr = PetscLogGpuFlops(bnz);CHKERRQ(ierr);
3224   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
3225   ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
3226   ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
3227   PetscFunctionReturn(0);
3228 }
3229 
3230 static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
3231 {
3232   PetscErrorCode             ierr;
3233   PetscBool                  both = PETSC_FALSE;
3234   Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;
3235 
3236   PetscFunctionBegin;
3237   if (A->factortype == MAT_FACTOR_NONE) {
3238     Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
3239     if (spptr->mat) {
3240       CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
3241       if (matrix->values) {
3242         both = PETSC_TRUE;
3243         thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
3244       }
3245     }
3246     if (spptr->matTranspose) {
3247       CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
3248       if (matrix->values) {
3249         thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
3250       }
3251     }
3252   }
3253   //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);
3254   ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr);
3255   ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
3256   if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
3257   else A->offloadmask = PETSC_OFFLOAD_CPU;
3258 
3259   PetscFunctionReturn(0);
3260 }
3261 
3262 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
3263 {
3264   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
3265   PetscErrorCode ierr;
3266 
3267   PetscFunctionBegin;
3268   if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
3269   if (flg) {
3270     ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
3271 
3272     A->ops->scale                     = MatScale_SeqAIJ;
3273     A->ops->axpy                      = MatAXPY_SeqAIJ;
3274     A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
3275     A->ops->mult                      = MatMult_SeqAIJ;
3276     A->ops->multadd                   = MatMultAdd_SeqAIJ;
3277     A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
3278     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
3279     A->ops->multhermitiantranspose    = NULL;
3280     A->ops->multhermitiantransposeadd = NULL;
3281     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
3282     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
3283     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
3284     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
3285     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
3286     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
3287     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
3288     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
3289   } else {
3290     A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
3291     A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
3292     A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
3293     A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
3294     A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
3295     A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
3296     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
3297     A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
3298     A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
3299     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
3300     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
3301     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
3302     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
3303     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
3304     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
3305     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
3306     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
3307   }
3308   A->boundtocpu = flg;
3309   a->inode.use = flg;
3310   PetscFunctionReturn(0);
3311 }
3312 
3313 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
3314 {
3315   PetscErrorCode   ierr;
3316   cusparseStatus_t stat;
3317   Mat              B;
3318 
3319   PetscFunctionBegin;
3320   ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */
3321   if (reuse == MAT_INITIAL_MATRIX) {
3322     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
3323   } else if (reuse == MAT_REUSE_MATRIX) {
3324     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
3325   }
3326   B = *newmat;
3327 
3328   ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
3329   ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);
3330 
3331   if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
3332     if (B->factortype == MAT_FACTOR_NONE) {
3333       Mat_SeqAIJCUSPARSE *spptr;
3334       ierr = PetscNew(&spptr);CHKERRQ(ierr);
3335       stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
3336       stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
3337       spptr->format     = MAT_CUSPARSE_CSR;
3338      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3339       spptr->spmvAlg    = CUSPARSE_CSRMV_ALG1;    /* default, since we only support csr */
3340       spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
3341       spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
3342      #endif
3343       B->spptr = spptr;
3344     } else {
3345       Mat_SeqAIJCUSPARSETriFactors *spptr;
3346 
3347       ierr = PetscNew(&spptr);CHKERRQ(ierr);
3348       stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
3349       stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
3350       B->spptr = spptr;
3351     }
3352     B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
3353   }
3354   B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
3355   B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
3356   B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
3357   B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
3358   B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
3359   B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;
3360 
3361   ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
3362   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
3363   ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
3364   PetscFunctionReturn(0);
3365 }
3366 
3367 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
3368 {
3369   PetscErrorCode ierr;
3370 
3371   PetscFunctionBegin;
3372   ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
3373   ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
3374   PetscFunctionReturn(0);
3375 }
3376 
3377 /*MC
3378    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3379 
3380    A matrix type whose data resides on Nvidia GPUs. These matrices can be stored in
3381    CSR, ELL, or Hybrid (HYB) format; the ELL and HYB formats require CUDA 4.2 or later.
3382    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3383 
3384    Options Database Keys:
3385 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
3386 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3387 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3388 
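3389    Example usage, a minimal sketch of creating a matrix of this type directly (m, n and ierr are assumed
   declared; the type can instead be selected at runtime with -mat_type aijcusparse and MatSetFromOptions()):
.vb
   Mat A;
   ierr = MatCreate(PETSC_COMM_SELF,&A);CHKERRQ(ierr);
   ierr = MatSetSizes(A,m,n,m,n);CHKERRQ(ierr);
   ierr = MatSetType(A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
   ierr = MatSeqAIJSetPreallocation(A,5,NULL);CHKERRQ(ierr);
.ve
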
3389   Level: beginner
3390 
3391 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3392 M*/
3393 
3394 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);
3395 
3396 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
3397 {
3398   PetscErrorCode ierr;
3399 
3400   PetscFunctionBegin;
3401   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr);
3402   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
3403   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
3404   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
3405   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
3406 
3407   PetscFunctionReturn(0);
3408 }
3409 
3410 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
3411 {
3412   PetscErrorCode   ierr;
3413   cusparseStatus_t stat;
3414 
3415   PetscFunctionBegin;
3416   if (*cusparsestruct) {
3417     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
3418     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
3419     delete (*cusparsestruct)->workVector;
3420     delete (*cusparsestruct)->rowoffsets_gpu;
3421     delete (*cusparsestruct)->cooPerm;
3422     delete (*cusparsestruct)->cooPerm_a;
3423     delete (*cusparsestruct)->csr2csc_i;
3424     if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
3425     ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
3426   }
3427   PetscFunctionReturn(0);
3428 }
3429 
3430 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
3431 {
3432   PetscFunctionBegin;
3433   if (*mat) {
3434     delete (*mat)->values;
3435     delete (*mat)->column_indices;
3436     delete (*mat)->row_offsets;
3437     delete *mat;
3438     *mat = 0;
3439   }
3440   PetscFunctionReturn(0);
3441 }
3442 
3443 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
3444 {
3445   cusparseStatus_t stat;
3446   PetscErrorCode   ierr;
3447 
3448   PetscFunctionBegin;
3449   if (*trifactor) {
3450     if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
3451     if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
3452     ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
3453     if ((*trifactor)->solveBuffer)   {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
3454     if ((*trifactor)->AA_h)   {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);}
3455    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3456     if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
3457    #endif
3458     ierr = PetscFree(*trifactor);CHKERRQ(ierr);
3459   }
3460   PetscFunctionReturn(0);
3461 }
3462 
3463 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
3464 {
3465   CsrMatrix        *mat;
3466   cusparseStatus_t stat;
3467   cudaError_t      err;
3468 
3469   PetscFunctionBegin;
3470   if (*matstruct) {
3471     if ((*matstruct)->mat) {
3472       if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
3473        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3474         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3475        #else
3476         cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
3477         stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
3478        #endif
3479       } else {
3480         mat = (CsrMatrix*)(*matstruct)->mat;
3481         CsrMatrix_Destroy(&mat);
3482       }
3483     }
3484     if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
3485     delete (*matstruct)->cprowIndices;
3486     if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
3487     if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
3488     if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
3489 
3490    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3491     Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
3492     if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
3493     for (int i=0; i<3; i++) {
3494       if (mdata->cuSpMV[i].initialized) {
3495         err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
3496         stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
3497         stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
3498       }
3499     }
3500    #endif
3501     delete *matstruct;
3502     *matstruct = NULL;
3503   }
3504   PetscFunctionReturn(0);
3505 }
3506 
3507 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
3508 {
3509   PetscErrorCode ierr;
3510 
3511   PetscFunctionBegin;
3512   if (*trifactors) {
3513     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
3514     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
3515     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
3516     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
3517     delete (*trifactors)->rpermIndices;
3518     delete (*trifactors)->cpermIndices;
3519     delete (*trifactors)->workVector;
3520     (*trifactors)->rpermIndices = NULL;
3521     (*trifactors)->cpermIndices = NULL;
3522     (*trifactors)->workVector = NULL;
3523     if ((*trifactors)->a_band_d)   {cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);}
3524     if ((*trifactors)->i_band_d)   {cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);}
3525   }
3526   PetscFunctionReturn(0);
3527 }
3528 
3529 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
3530 {
3531   PetscErrorCode   ierr;
3532   cusparseHandle_t handle;
3533   cusparseStatus_t stat;
3534 
3535   PetscFunctionBegin;
3536   if (*trifactors) {
3537     ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
3538     if ((handle = (*trifactors)->handle)) {
3539       stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
3540     }
3541     ierr = PetscFree(*trifactors);CHKERRQ(ierr);
3542   }
3543   PetscFunctionReturn(0);
3544 }
3545 
3546 struct IJCompare
3547 {
3548   __host__ __device__
3549   inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
3550   {
3551     if (t1.get<0>() < t2.get<0>()) return true;
3552     if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
3553     return false;
3554   }
3555 };
3556 
3557 struct IJEqual
3558 {
3559   __host__ __device__
3560   inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
3561   {
3562     if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false;
3563     return true;
3564   }
3565 };
3566 
3567 struct IJDiff
3568 {
3569   __host__ __device__
3570   inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
3571   {
3572     return t1 == t2 ? 0 : 1;
3573   }
3574 };
3575 
3576 struct IJSum
3577 {
3578   __host__ __device__
3579   inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
3580   {
3581     return t1||t2;
3582   }
3583 };
3584 
3585 #include <thrust/iterator/discard_iterator.h>
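/*
   Fills the CSR values on the GPU from user-provided COO values: the values are routed through the cooPerm
   permutation built by MatSetPreallocationCOO_SeqAIJCUSPARSE(), and repeated (i,j) entries are summed with a
   reduce_by_key. A minimal sketch of the public COO interface this backs (assuming coo_i, coo_j and coo_v are
   host arrays of length ncoo):

     ierr = MatSetPreallocationCOO(A,ncoo,coo_i,coo_j);CHKERRQ(ierr);
     ierr = MatSetValuesCOO(A,coo_v,INSERT_VALUES);CHKERRQ(ierr);
*/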
3586 PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
3587 {
3588   Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3589   Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
3590   THRUSTARRAY                           *cooPerm_v = NULL;
3591   thrust::device_ptr<const PetscScalar> d_v;
3592   CsrMatrix                             *matrix;
3593   PetscErrorCode                        ierr;
3594   cudaError_t                           cerr;
3595   PetscInt                              n;
3596 
3597   PetscFunctionBegin;
3598   if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
3599   if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
3600   if (!cusp->cooPerm) {
3601     ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
3602     ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
3603     PetscFunctionReturn(0);
3604   }
3605   matrix = (CsrMatrix*)cusp->mat->mat;
3606   if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
3607   if (!v) {
3608     if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
3609     goto finalize;
3610   }
3611   n = cusp->cooPerm->size();
3612   if (isCudaMem(v)) {
3613     d_v = thrust::device_pointer_cast(v);
3614   } else {
3615     cooPerm_v = new THRUSTARRAY(n);
3616     cooPerm_v->assign(v,v+n);
3617     d_v = cooPerm_v->data();
3618     ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
3619   }
3620   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3621   if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
3622     if (cusp->cooPerm_a) {
3623       THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
3624       auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
3625       thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
3626       thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
3627       delete cooPerm_w;
3628     } else {
3629       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
3630                                                                 matrix->values->begin()));
3631       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
3632                                                                 matrix->values->end()));
3633       thrust::for_each(zibit,zieit,VecCUDAPlusEquals());
3634     }
3635   } else {
3636     if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
3637       auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
3638       thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
3639     } else {
3640       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
3641                                                                 matrix->values->begin()));
3642       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
3643                                                                 matrix->values->end()));
3644       thrust::for_each(zibit,zieit,VecCUDAEquals());
3645     }
3646   }
3647   cerr = WaitForCUDA();CHKERRCUDA(cerr);
3648   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
3649 finalize:
3650   delete cooPerm_v;
3651   A->offloadmask = PETSC_OFFLOAD_GPU;
3652   ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
3653   /* shorter version of MatAssemblyEnd_SeqAIJ */
3654   ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr);
3655   ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
3656   ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr);
3657   a->reallocs         = 0;
3658   A->info.mallocs    += 0;
3659   A->info.nz_unneeded = 0;
3660   A->assembled = A->was_assembled = PETSC_TRUE;
3661   A->num_ass++;
3662   PetscFunctionReturn(0);
3663 }
3664 
3665 PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
3666 {
3667   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3668   PetscErrorCode     ierr;
3669 
3670   PetscFunctionBegin;
3671   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
3672   if (!cusp) PetscFunctionReturn(0);
3673   if (destroy) {
3674     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
3675     delete cusp->csr2csc_i;
3676     cusp->csr2csc_i = NULL;
3677   }
3678   A->transupdated = PETSC_FALSE;
3679   PetscFunctionReturn(0);
3680 }
3681 
3682 #include <thrust/binary_search.h>
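/*
   Builds the CSR structure of A from COO indices: the (i,j) pairs are sorted on the GPU by row and then column,
   duplicates are detected and folded into cooPerm_a, and the row offsets are obtained by an upper_bound search
   over the sorted row indices. The resulting permutation cooPerm maps every original COO entry to its (unique)
   destination in the CSR values array and is later consumed by MatSetValuesCOO_SeqAIJCUSPARSE().
*/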
3683 PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
3684 {
3685   PetscErrorCode     ierr;
3686   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3687   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
3688   PetscInt           cooPerm_n, nzr = 0;
3689   cudaError_t        cerr;
3690 
3691   PetscFunctionBegin;
3692   ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
3693   ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
3694   cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
3695   if (n != cooPerm_n) {
3696     delete cusp->cooPerm;
3697     delete cusp->cooPerm_a;
3698     cusp->cooPerm = NULL;
3699     cusp->cooPerm_a = NULL;
3700   }
3701   if (n) {
3702     THRUSTINTARRAY d_i(n);
3703     THRUSTINTARRAY d_j(n);
3704     THRUSTINTARRAY ii(A->rmap->n);
3705 
3706     if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
3707     if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }
3708 
3709     ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
3710     d_i.assign(coo_i,coo_i+n);
3711     d_j.assign(coo_j,coo_j+n);
3712     auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
3713     auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));
3714 
3715     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3716     thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
3717     thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare());
3718     *cusp->cooPerm_a = d_i;
3719     THRUSTINTARRAY w = d_j;
3720 
3721     auto nekey = thrust::unique(fkey, ekey, IJEqual());
3722     if (nekey == ekey) { /* all entries are unique */
3723       delete cusp->cooPerm_a;
3724       cusp->cooPerm_a = NULL;
3725     } else { /* I couldn't come up with a more elegant algorithm: flag where (i,j) differs from the previous entry, then scan to map each COO entry to its unique destination */
3726       adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff());
3727       adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());
3728       (*cusp->cooPerm_a)[0] = 0;
3729       w[0] = 0;
3730       thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum());
3731       thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>());
3732     }
3733     thrust::counting_iterator<PetscInt> search_begin(0);
3734     thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(),
3735                         search_begin, search_begin + A->rmap->n,
3736                         ii.begin());
3737     cerr = WaitForCUDA();CHKERRCUDA(cerr);
3738     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
3739 
3740     ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
3741     a->singlemalloc = PETSC_FALSE;
3742     a->free_a       = PETSC_TRUE;
3743     a->free_ij      = PETSC_TRUE;
3744     ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
3745     a->i[0] = 0;
3746     cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
3747     a->nz = a->maxnz = a->i[A->rmap->n];
3748     a->rmax = 0;
3749     ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
3750     ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
3751     cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
3752     if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
3753     if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
3754     for (PetscInt i = 0; i < A->rmap->n; i++) {
3755       const PetscInt nnzr = a->i[i+1] - a->i[i];
3756       nzr += (PetscInt)!!(nnzr);
3757       a->ilen[i] = a->imax[i] = nnzr;
3758       a->rmax = PetscMax(a->rmax,nnzr);
3759     }
3760     a->nonzerorowcnt = nzr;
3761     A->preallocated = PETSC_TRUE;
3762     ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
3763     ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
3764   } else {
3765     ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
3766   }
3767   ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);
3768 
3769   /* We want to allocate the CUSPARSE struct for matvec now.
3770      The code is so convoluted now that I prefer to copy zeros */
3771   ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
3772   ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
3773   A->offloadmask = PETSC_OFFLOAD_CPU;
3774   A->nonzerostate++;
3775   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
3776   ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
3777 
3778   A->assembled = PETSC_FALSE;
3779   A->was_assembled = PETSC_FALSE;
3780   PetscFunctionReturn(0);
3781 }
3782 
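/*
   Accessors for the raw device pointer to the CSR values of a MATSEQAIJCUSPARSE matrix. A minimal usage sketch
   (the pointer is valid only until the matching Restore call):

     const PetscScalar *av;
     ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
     ... use av in CUDA kernels or cuSPARSE/cuBLAS calls ...
     ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);

   The writable variants below additionally mark the GPU copy as up to date and invalidate the cached transpose.
*/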
3783 PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
3784 {
3785   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3786   CsrMatrix          *csr;
3787   PetscErrorCode     ierr;
3788 
3789   PetscFunctionBegin;
3790   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
3791   PetscValidPointer(a,2);
3792   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
3793   if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3794   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
3795   if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3796   csr = (CsrMatrix*)cusp->mat->mat;
3797   if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
3798   *a = csr->values->data().get();
3799   PetscFunctionReturn(0);
3800 }
3801 
3802 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
3803 {
3804   PetscFunctionBegin;
3805   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
3806   PetscValidPointer(a,2);
3807   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
3808   *a = NULL;
3809   PetscFunctionReturn(0);
3810 }
3811 
3812 PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
3813 {
3814   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3815   CsrMatrix          *csr;
3816   PetscErrorCode     ierr;
3817 
3818   PetscFunctionBegin;
3819   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
3820   PetscValidPointer(a,2);
3821   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
3822   if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3823   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
3824   if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3825   csr = (CsrMatrix*)cusp->mat->mat;
3826   if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
3827   *a = csr->values->data().get();
3828   A->offloadmask = PETSC_OFFLOAD_GPU;
3829   ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
3830   PetscFunctionReturn(0);
3831 }
3832 
3833 PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
3834 {
3835   PetscErrorCode ierr;
3836 
3837   PetscFunctionBegin;
3838   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
3839   PetscValidPointer(a,2);
3840   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
3841   ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
3842   *a = NULL;
3843   PetscFunctionReturn(0);
3844 }
3845 
3846 PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
3847 {
3848   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3849   CsrMatrix          *csr;
3850   PetscErrorCode     ierr;
3851 
3852   PetscFunctionBegin;
3853   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
3854   PetscValidPointer(a,2);
3855   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
3856   if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3857   if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3858   csr = (CsrMatrix*)cusp->mat->mat;
3859   if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
3860   *a = csr->values->data().get();
3861   A->offloadmask = PETSC_OFFLOAD_GPU;
3862   ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
3863   PetscFunctionReturn(0);
3864 }
3865 
3866 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
3867 {
3868   PetscErrorCode ierr;
3869 
3870   PetscFunctionBegin;
3871   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
3872   PetscValidPointer(a,2);
3873   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
3874   ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
3875   *a = NULL;
3876   PetscFunctionReturn(0);
3877 }
3878 
3879 struct IJCompare4
3880 {
3881   __host__ __device__
3882   inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
3883   {
3884     if (t1.get<0>() < t2.get<0>()) return true;
3885     if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
3886     return false;
3887   }
3888 };
3889 
3890 struct Shift
3891 {
3892   int _shift;
3893 
3894   Shift(int shift) : _shift(shift) {}
3895   __host__ __device__
3896   inline int operator() (const int &c)
3897   {
3898     return c + _shift;
3899   }
3900 };
3901 
3902 /* merges two SeqAIJCUSPARSE matrices side by side, i.e. C = [A,B] ([A';B']' in MATLAB notation) */
3903 PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
3904 {
3905   PetscErrorCode               ierr;
3906   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
3907   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
3908   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
3909   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
3910   PetscInt                     Annz,Bnnz;
3911   cusparseStatus_t             stat;
3912   PetscInt                     i,m,n,zero = 0;
3913   cudaError_t                  cerr;
3914 
3915   PetscFunctionBegin;
3916   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
3917   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
3918   PetscValidPointer(C,4);
3919   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
3920   PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
3921   if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %D != %D",A->rmap->n,B->rmap->n);
3922   if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
3923   if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3924   if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3925   if (reuse == MAT_INITIAL_MATRIX) {
3926     m     = A->rmap->n;
3927     n     = A->cmap->n + B->cmap->n;
3928     ierr  = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
3929     ierr  = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
3930     ierr  = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
3931     c     = (Mat_SeqAIJ*)(*C)->data;
3932     Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
3933     Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
3934     Ccsr  = new CsrMatrix;
3935     Cmat->cprowIndices      = NULL;
3936     c->compressedrow.use    = PETSC_FALSE;
3937     c->compressedrow.nrows  = 0;
3938     c->compressedrow.i      = NULL;
3939     c->compressedrow.rindex = NULL;
3940     Ccusp->workVector       = NULL;
3941     Ccusp->nrows    = m;
3942     Ccusp->mat      = Cmat;
3943     Ccusp->mat->mat = Ccsr;
3944     Ccsr->num_rows  = m;
3945     Ccsr->num_cols  = n;
3946     stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
3947     stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
3948     stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
3949     cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
3950     cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
3951     cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
3952     cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
3953     cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
3954     cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
3955     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
3956     ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
3957     ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
3958     ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr);
3959     if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3960     if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3961 
3962     Acsr = (CsrMatrix*)Acusp->mat->mat;
3963     Bcsr = (CsrMatrix*)Bcusp->mat->mat;
3964     Annz = (PetscInt)Acsr->column_indices->size();
3965     Bnnz = (PetscInt)Bcsr->column_indices->size();
3966     c->nz = Annz + Bnnz;
3967     Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
3968     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3969     Ccsr->values = new THRUSTARRAY(c->nz);
3970     Ccsr->num_entries = c->nz;
3971     Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
3972     if (c->nz) {
3973       auto Acoo = new THRUSTINTARRAY32(Annz);
3974       auto Bcoo = new THRUSTINTARRAY32(Bnnz);
3975       auto Ccoo = new THRUSTINTARRAY32(c->nz);
3976       THRUSTINTARRAY32 *Aroff,*Broff;
3977 
3978       if (a->compressedrow.use) { /* need full row offset */
3979         if (!Acusp->rowoffsets_gpu) {
3980           Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
3981           Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
3982           ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
3983         }
3984         Aroff = Acusp->rowoffsets_gpu;
3985       } else Aroff = Acsr->row_offsets;
3986       if (b->compressedrow.use) { /* need full row offset */
3987         if (!Bcusp->rowoffsets_gpu) {
3988           Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
3989           Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
3990           ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
3991         }
3992         Broff = Bcusp->rowoffsets_gpu;
3993       } else Broff = Bcsr->row_offsets;
3994       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3995       stat = cusparseXcsr2coo(Acusp->handle,
3996                               Aroff->data().get(),
3997                               Annz,
3998                               m,
3999                               Acoo->data().get(),
4000                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4001       stat = cusparseXcsr2coo(Bcusp->handle,
4002                               Broff->data().get(),
4003                               Bnnz,
4004                               m,
4005                               Bcoo->data().get(),
4006                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4007       /* Issues when using bool with large matrices on SUMMIT with CUDA 10.2.89 */
4008       auto Aperm = thrust::make_constant_iterator(1);
4009       auto Bperm = thrust::make_constant_iterator(0);
4010 #if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
4011       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
4012       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
4013 #else
4014       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
4015       auto Bcib = Bcsr->column_indices->begin();
4016       auto Bcie = Bcsr->column_indices->end();
4017       thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
4018 #endif
4019       auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
4020       auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
4021       auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
4022       auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
4023       auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
4024       auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
4025       auto p1 = Ccusp->cooPerm->begin();
4026       auto p2 = Ccusp->cooPerm->begin();
4027       thrust::advance(p2,Annz);
4028       PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
4029 #if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
4030       thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
4031 #endif
4032       auto cci = thrust::make_counting_iterator(zero);
4033       auto cce = thrust::make_counting_iterator(c->nz);
4034 #if 0 //Errors on SUMMIT cuda 11.1.0
4035       PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
4036 #else
4037       auto pred = thrust::identity<int>();
4038       PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
4039       PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
4040 #endif
4041       stat = cusparseXcoo2csr(Ccusp->handle,
4042                               Ccoo->data().get(),
4043                               c->nz,
4044                               m,
4045                               Ccsr->row_offsets->data().get(),
4046                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4047       cerr = WaitForCUDA();CHKERRCUDA(cerr);
4048       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4049       delete wPerm;
4050       delete Acoo;
4051       delete Bcoo;
4052       delete Ccoo;
4053 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4054       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
4055                                Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
4056                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4057                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4058 #endif
4059       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
4060         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4061         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4062         CsrMatrix *CcsrT = new CsrMatrix;
4063         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4064         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4065 
4066         (*C)->form_explicit_transpose = PETSC_TRUE;
4067         (*C)->transupdated = PETSC_TRUE;
4068         Ccusp->rowoffsets_gpu = NULL;
4069         CmatT->cprowIndices = NULL;
4070         CmatT->mat = CcsrT;
4071         CcsrT->num_rows = n;
4072         CcsrT->num_cols = m;
4073         CcsrT->num_entries = c->nz;
4074 
4075         CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
4076         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4077         CcsrT->values = new THRUSTARRAY(c->nz);
4078 
4079         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4080         auto rT = CcsrT->row_offsets->begin();
4081         if (AT) {
4082           rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
4083           thrust::advance(rT,-1);
4084         }
4085         if (BT) {
4086           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
4087           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
4088           thrust::copy(titb,tite,rT);
4089         }
4090         auto cT = CcsrT->column_indices->begin();
4091         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
4092         if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
4093         auto vT = CcsrT->values->begin();
4094         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4095         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4096         cerr = WaitForCUDA();CHKERRCUDA(cerr);
4097         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4098 
4099         stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
4100         stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4101         stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
4102         cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
4103         cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
4104         cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
4105         cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4106         cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4107         cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4108 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4109         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
4110                                  CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
4111                                  CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4112                                  CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4113 #endif
4114         Ccusp->matTranspose = CmatT;
4115       }
4116     }
4117 
4118     c->singlemalloc = PETSC_FALSE;
4119     c->free_a       = PETSC_TRUE;
4120     c->free_ij      = PETSC_TRUE;
4121     ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
4122     ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
4123     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
4124       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4125       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4126       ii   = *Ccsr->row_offsets;
4127       jj   = *Ccsr->column_indices;
4128       cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4129       cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4130     } else {
4131       cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4132       cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4133     }
4134     ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
4135     ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
4136     ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
4137     c->maxnz = c->nz;
4138     c->nonzerorowcnt = 0;
4139     c->rmax = 0;
4140     for (i = 0; i < m; i++) {
4141       const PetscInt nn = c->i[i+1] - c->i[i];
4142       c->ilen[i] = c->imax[i] = nn;
4143       c->nonzerorowcnt += (PetscInt)!!nn;
4144       c->rmax = PetscMax(c->rmax,nn);
4145     }
4146     ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
4147     ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
4148     (*C)->nonzerostate++;
4149     ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
4150     ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
4151     Ccusp->nonzerostate = (*C)->nonzerostate;
4152     (*C)->preallocated  = PETSC_TRUE;
4153   } else {
4154     if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %D != %D",(*C)->rmap->n,B->rmap->n);
4155     c = (Mat_SeqAIJ*)(*C)->data;
4156     if (c->nz) {
4157       Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
4158       if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
4159       if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4160       if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
4161       ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
4162       ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
4163       if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4164       if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4165       Acsr = (CsrMatrix*)Acusp->mat->mat;
4166       Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4167       Ccsr = (CsrMatrix*)Ccusp->mat->mat;
4168       if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
4169       if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
4170       if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
4171       if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
4172       if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
4173       auto pmid = Ccusp->cooPerm->begin();
4174       thrust::advance(pmid,Acsr->num_entries);
4175       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4176       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
4177                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
4178       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
4179                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4180       thrust::for_each(zibait,zieait,VecCUDAEquals());
4181       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
4182                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4183       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
4184                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
4185       thrust::for_each(zibbit,ziebit,VecCUDAEquals());
4186       ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr);
4187       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
4188         if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4189         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4190         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4191         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4192         CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
4193         auto vT = CcsrT->values->begin();
4194         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4195         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4196         (*C)->transupdated = PETSC_TRUE;
4197       }
4198       cerr = WaitForCUDA();CHKERRCUDA(cerr);
4199       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4200     }
4201   }
4202   ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
4203   (*C)->assembled     = PETSC_TRUE;
4204   (*C)->was_assembled = PETSC_FALSE;
4205   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
4206   PetscFunctionReturn(0);
4207 }
4208 
4209 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
4210 {
4211   PetscErrorCode    ierr;
4212   bool              dmem;
4213   const PetscScalar *av;
4214   cudaError_t       cerr;
4215 
4216   PetscFunctionBegin;
4217   dmem = isCudaMem(v);
4218   ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
4219   if (n && idx) {
4220     THRUSTINTARRAY widx(n);
4221     widx.assign(idx,idx+n);
4222     ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
4223 
4224     THRUSTARRAY *w = NULL;
4225     thrust::device_ptr<PetscScalar> dv;
4226     if (dmem) {
4227       dv = thrust::device_pointer_cast(v);
4228     } else {
4229       w = new THRUSTARRAY(n);
4230       dv = w->data();
4231     }
4232     thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);
4233 
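    /* device-side gather: dv[k] = av[widx[k]] for k = 0..n-1 (VecCUDAEquals assigns the second tuple
       member from the first) */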
4234     auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
4235     auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
4236     thrust::for_each(zibit,zieit,VecCUDAEquals());
4237     if (w) {
4238       cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4239     }
4240     delete w;
4241   } else {
4242     cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4243   }
4244   if (!dmem) { ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr); }
4245   ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
4246   PetscFunctionReturn(0);
4247 }
4248 
4249 /*
4250   LU BAND factorization with optimization for block diagonal (Nf blocks) in natural order (-mat_no_inode -pc_factor_mat_ordering_type rcm with Nf>1 fields)
4251 
4252   requires:
4253      a structurally symmetric matrix; this restriction could be lifted with transpose/column metadata
4254 */
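/*
  A sketch of the band storage assumed by the kernels below: the factor is kept as a dense band in a
  CSR-like layout with no column-index array.  Row r stores the entries for columns
  [max(0,r-bw), min(n-1,r+bw)] contiguously in ba_csr, so a full row has 2*bw+1 entries and the first
  and last bw rows are shorter (the clipped "ears").  bi_csr[r] is the offset of row r and bi_csr[n]
  the total count, exactly as in ordinary CSR; the column of an entry is implied by its position in
  the row.
*/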
4255 
4256 /*
4257   The GPU LU factor kernel
4258 */
4259 __global__
4260 void __launch_bounds__(1024,1)
4261 mat_lu_factor_band_init_set_i(const PetscInt n, const int bw, int bi_csr[])
4262 {
4263   const PetscInt  Nf = gridDim.x, Nblk = gridDim.y, nloc = n/Nf;
4264   const PetscInt  field = blockIdx.x, blkIdx = blockIdx.y;
4265   const PetscInt  nloc_i =  (nloc/Nblk + !!(nloc%Nblk)), start_i = field*nloc + blkIdx*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
4266 
4267   // set i (row+1)
4268   if (threadIdx.x + threadIdx.y + blockIdx.x + blockIdx.y == 0) bi_csr[0] = 0; // dummy at zero
4269   // for (int rowb = start_i + blkIdx*blockDim.y + threadIdx.y; rowb < end_i; rowb += Nblk*blockDim.y) { // rows in block
4270   for (int rowb = start_i + threadIdx.y; rowb < end_i; rowb += blockDim.y) { // rows in block by thread y
4271     if (rowb < end_i && threadIdx.x==0) {
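      /* closed-form prefix sum of the band row lengths: row k holds min(k,bw) + 1 + min(n-1-k,bw)
         entries, so bi_csr[rowb+1] = sum_{k=0..rowb} of that; n1L and n2L count the lower parts,
         i + nug the diagonals plus a full upper band of width bw, and clip removes the upper-right ear */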
4272       PetscInt i=rowb+1, ni = (rowb>bw) ? bw+1 : i, n1L = ni*(ni-1)/2, nug= i*bw, n2L = bw*((rowb>bw) ? (rowb-bw) : 0), mi = bw + rowb + 1 - n, clip = (mi>0) ? mi*(mi-1)/2 + mi: 0;
4273       bi_csr[rowb+1] = n1L + nug - clip + n2L + i;
4274     }
4275   }
4276 }
4277 // copy AIJ to AIJ_BAND
4278 __global__
4279 void __launch_bounds__(1024,1)
4280 mat_lu_factor_band_copy_aij_aij(const PetscInt n, const int bw, const PetscInt r[], const PetscInt ic[],
4281                                 const int ai_d[], const int aj_d[], const PetscScalar aa_d[],
4282                                 const int bi_csr[], PetscScalar ba_csr[])
4283 {
4284   const PetscInt  Nf = gridDim.x, Nblk = gridDim.y, nloc = n/Nf;
4285   const PetscInt  field = blockIdx.x, blkIdx = blockIdx.y;
4286   const PetscInt  nloc_i =  (nloc/Nblk + !!(nloc%Nblk)), start_i = field*nloc + blkIdx*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
4287 
4288   // zero B
4289   if (threadIdx.x + threadIdx.y + blockIdx.x + blockIdx.y == 0) ba_csr[bi_csr[n]] = 0; // zero the extra trailing slot (the +1 reserved for a flop count)
4290   for (int rowb = start_i + threadIdx.y; rowb < end_i; rowb += blockDim.y) { // rows in block by thread y
4291     if (rowb < end_i) {
4292       PetscScalar    *batmp = ba_csr + bi_csr[rowb];
4293       const PetscInt nzb = bi_csr[rowb+1] - bi_csr[rowb];
4294       for (int j=threadIdx.x ; j<nzb ; j += blockDim.x) {
4295         if (j<nzb) {
4296           batmp[j] = 0;
4297         }
4298       }
4299     }
4300   }
4301 
4302   // copy A into B with CSR format -- these two loops can be fused
4303   for (int rowb = start_i + threadIdx.y; rowb < end_i; rowb += blockDim.y) { // rows in block by thread y
4304     if (rowb < end_i) {
4305       const PetscInt    rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
4306       const int         *ajtmp = aj_d + ai_d[rowa], bjStart = (rowb>bw) ? rowb-bw : 0;
4307       const PetscScalar *av    = aa_d + ai_d[rowa];
4308       PetscScalar       *batmp = ba_csr + bi_csr[rowb];
4309       /* load in initial (unfactored row) */
4310       for (int j=threadIdx.x ; j<nza ; j += blockDim.x) {
4311         if (j<nza) {
4312           PetscInt    colb = ic[ajtmp[j]], idx = colb - bjStart;
4313           PetscScalar vala = av[j];
4314           batmp[idx] = vala;
4315         }
4316       }
4317     }
4318   }
4319 }
4320 // print AIJ_BAND
4321 __global__
4322 void print_mat_aij_band(const PetscInt n, const int bi_csr[], const PetscScalar ba_csr[])
4323 {
4324   // debug
4325   if (threadIdx.x + threadIdx.y + blockIdx.x + blockIdx.y == 0){
4326     printf("B (AIJ) n=%d:\n",(int)n);
4327     for (int rowb=0;rowb<n;rowb++) {
4328       const PetscInt    nz = bi_csr[rowb+1] - bi_csr[rowb];
4329       const PetscScalar *batmp = ba_csr + bi_csr[rowb];
4330       for (int j=0; j<nz; j++) printf("(%13.6e) ",PetscRealPart(batmp[j]));
4331       printf(" bi=%d\n",bi_csr[rowb+1]);
4332     }
4333   }
4334 }
4335 // Band LU kernel ---  ba_csr bi_csr
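/*
  Roughly: a right-looking band LU factorization without pivoting, one field (diagonal block) per
  gridDim.x.  For each diagonal entry Bdd, taken in order, the threads first scale the column below
  it, L(i,d) = A(i,d)/Bdd for the next nzUd rows, and then apply the rank-1 update
  A(i,j) -= L(i,d)*U(d,j) to the trailing nzUd x nzUd block that fits inside the band.  Rows of the
  update are spread over (blockIdx.y, threadIdx.y) and columns over threadIdx.x, so the blocks of a
  field must synchronize grid-wide (cooperative groups on CUDA >= 11) between diagonal steps.
*/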
4336 __global__
4337 void __launch_bounds__(1024,1)
4338 mat_lu_factor_band(const PetscInt n, const PetscInt bw, const int bi_csr[], PetscScalar ba_csr[])
4339 {
4340   extern __shared__ PetscInt smemInt[];
4341   PetscInt        *sm_pkIdx  = &smemInt[0];
4342   const PetscInt  Nf = gridDim.x, Nblk = gridDim.y, nloc = n/Nf;
4343   const PetscInt  field = blockIdx.x, blkIdx = blockIdx.y;
4344   const PetscInt  start = field*nloc, end = start + nloc;
4345 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4346   auto g = cooperative_groups::this_grid();
4347 #endif
4348   // A22 panel update for each row A(1,:) and col A(:,1)
4349   for (int glbDD=start, locDD = 0; glbDD<end; glbDD++, locDD++) {
4350     PetscInt          tnzUd = bw, maxU = end-1 - glbDD; // chop the inter-field ears: never update past this field's block
4351     const PetscInt    nzUd  = (tnzUd>maxU) ? maxU : tnzUd, dOffset = (glbDD > bw) ? bw : glbDD; // offset of the diagonal within its stored row (shorter for the first bw rows)
4352     const PetscInt    nzUd_pad = blockDim.y*(nzUd/blockDim.y + !!(nzUd%blockDim.y));
4353     PetscScalar       *pBdd = ba_csr + bi_csr[glbDD] + dOffset;
4354     const PetscScalar *baUd = pBdd + 1; // vector of data  U(i,i+1:end)
4355     const PetscScalar Bdd = *pBdd;
4356     const PetscInt offset = blkIdx*blockDim.y + threadIdx.y, inc = Nblk*blockDim.y;
4357     for (int idx = offset, myi = glbDD + offset + 1; idx < nzUd_pad ; idx += inc, myi += inc) { /* assuming symmetric structure */
4358       if (idx < nzUd && threadIdx.x==0) { /* assuming symmetric structure */
4359         const PetscInt bwi = myi > bw ? bw : myi, kIdx = bwi - (myi-glbDD); // cuts off just the first (global) block
4360         PetscScalar    *Aid = ba_csr + bi_csr[myi] + kIdx;
4361         *Aid = *Aid/Bdd;
4362         sm_pkIdx[threadIdx.y] = kIdx;
4363       }
4364       __syncthreads(); // block-wide sync: make sm_pkIdx (written by threadIdx.x==0) visible to the other threads of the row
4365       if (idx < nzUd) { /* assuming symmetric structure */
4366         PetscInt    kIdx = sm_pkIdx[threadIdx.y];
4367         PetscScalar *Aid = ba_csr + bi_csr[myi] + kIdx;
4368         PetscScalar *Aij =  Aid + 1;
4369         PetscScalar Lid  = *Aid;
4370         for (int jIdx=threadIdx.x ; jIdx<nzUd ; jIdx += blockDim.x) {
4371           if (jIdx<nzUd) {
4372             Aij[jIdx] -= Lid*baUd[jIdx];
4373           }
4374         }
4375       }
4376     }
4377 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4378     g.sync();
4379 #else
4380     __syncthreads();
4381 #endif
4382   } /* end of for (glbDD=start; glbDD<end; glbDD++) */
4383 }
4384 
4385 static PetscErrorCode MatSolve_SeqAIJCUSPARSEBAND(Mat,Vec,Vec);
4386 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSEBAND(Mat B,Mat A,const MatFactorInfo *info)
4387 {
4388   Mat_SeqAIJ                   *b = (Mat_SeqAIJ*)B->data;
4389   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
4390   if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
4391   Mat_SeqAIJCUSPARSE           *cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr;
4392   Mat_SeqAIJCUSPARSEMultStruct *matstructA;
4393   CsrMatrix                    *matrixA;
4394   PetscErrorCode               ierr;
4395   cudaError_t                  cerr;
4396   const PetscInt               n=A->rmap->n, *ic, *r;
4397   const int                    *ai_d, *aj_d;
4398   const PetscScalar            *aa_d;
4399   PetscScalar                  *ba_t = cusparseTriFactors->a_band_d;
4400   int                          *bi_t = cusparseTriFactors->i_band_d;
4401   PetscContainer               container;
4402   int                          Ni = 10, team_size=9, Nf, nVec=56, nconcurrent = 1, nsm = -1;
4403 
4404   PetscFunctionBegin;
4405   if (A->rmap->n == 0) {
4406     PetscFunctionReturn(0);
4407   }
4408   // cusparse setup
4409   if (!cusparsestructA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparsestructA");
4410   matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat; //  matstruct->cprowIndices
4411   if (!matstructA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing mat struct");
4412   matrixA = (CsrMatrix*)matstructA->mat;
4413   if (!matrixA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing matrix cusparsestructA->mat->mat");
4414 
4415   // factor: get Nf if available
4416   ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr);
4417   if (container) {
4418     PetscInt *pNf=NULL;
4419     ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr);
4420     Nf = (*pNf)%1000;
4421     if ((*pNf)/1000>0) nconcurrent = (*pNf)/1000; // number of SMs to use
4422   } else Nf = 1;
4423   if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf);
4424 
4425   // get data
4426   ic      = thrust::raw_pointer_cast(cusparseTriFactors->cpermIndices->data());
4427   ai_d    = thrust::raw_pointer_cast(matrixA->row_offsets->data());
4428   aj_d    = thrust::raw_pointer_cast(matrixA->column_indices->data());
4429   aa_d    = thrust::raw_pointer_cast(matrixA->values->data());
4430   r       = thrust::raw_pointer_cast(cusparseTriFactors->rpermIndices->data());
4431 
4432   cerr = WaitForCUDA();CHKERRCUDA(cerr);
4433   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4434   {
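    /* recover the bandwidth from the stored nonzero count: the symbolic phase sets
       nz = n + (2*n-1)*bw - bw*bw, so bw is the smaller root of bw^2 - (2*n-1)*bw + (nz-n) = 0,
       i.e. bw = ((2*n-1) - sqrt(1 + 4*(n*n-nz)))/2; the epsilon guards the float-to-int truncation */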
4435     int bw = (2*n-1 - (int)(PetscSqrtReal(1+4*(n*n-b->nz))+PETSC_MACHINE_EPSILON))/2, bm1=bw-1,nl=n/Nf;
4436     int gpuid;
4437     cudaDeviceProp prop;
4438     cerr = cudaGetDevice(&gpuid);CHKERRCUDA(cerr);
4439     cerr = cudaGetDeviceProperties(&prop, gpuid);CHKERRCUDA(cerr);
4440 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
4441     Ni = 1; /* cooperative kernel launch is not available before CUDA 11, so use a single block of rows per field */
4443 #else
4444     nsm = prop.multiProcessorCount;
4445     Ni = nsm/Nf/nconcurrent;
4446 #endif
4447     team_size = bw/Ni + !!(bw%Ni);
4448     nVec = PetscMin(bw, 1024/team_size);
4449     ierr = PetscInfo5(A,"Matrix Bandwidth = %d, number SMs/block = %d, num concurrency = %d, num fields = %d, numSMs/GPU = %d\n",bw,Ni,nconcurrent,Nf,nsm);CHKERRQ(ierr);
4450     {
4451       dim3 dimBlockTeam(nVec,team_size);
4452       dim3 dimBlockLeague(Nf,Ni);
4453       mat_lu_factor_band_copy_aij_aij<<<dimBlockLeague,dimBlockTeam>>>(n, bw, r, ic, ai_d, aj_d, aa_d, bi_t, ba_t);
4454       CHECK_LAUNCH_ERROR(); // does a sync
4455 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4456       void *kernelArgs[] = { (void*)&n, (void*)&bw, (void*)&bi_t, (void*)&ba_t};
4457       cerr = cudaLaunchCooperativeKernel((void*)mat_lu_factor_band, dimBlockLeague, dimBlockTeam, kernelArgs, team_size*sizeof(PetscInt), NULL);CHKERRCUDA(cerr);
4458 #else
4459       mat_lu_factor_band<<<dimBlockLeague,dimBlockTeam,team_size*sizeof(PetscInt)>>>(n, bw, bi_t, ba_t);
4460 #endif
4461       CHECK_LAUNCH_ERROR(); // does a sync
4462 #if defined(PETSC_USE_LOG)
4463       ierr = PetscLogGpuFlops((PetscLogDouble)Nf*(bm1*(bm1 + 1)*(2*bm1 + 1)/3 + 2*(nl-bw)*bw*bw + nl*(nl+1)/2));CHKERRQ(ierr);
4464 #endif
4465     }
4466   }
4467   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4468 
4469   /* determine which version of MatSolve needs to be used (adapted from MatLUFactorNumeric_AIJ_SeqAIJCUSPARSE) */
4470   B->ops->solve = MatSolve_SeqAIJCUSPARSEBAND;
4471   B->ops->solvetranspose = NULL; // need transpose
4472   B->ops->matsolve = NULL;
4473   B->ops->matsolvetranspose = NULL;
4474 
4475   PetscFunctionReturn(0);
4476 }
4477 
4478 static PetscErrorCode MatrixNfDestroy(void *ptr)
4479 {
4480   PetscInt *nf = (PetscInt *)ptr;
4481   PetscErrorCode  ierr;
4482   PetscFunctionBegin;
4483   ierr = PetscFree(nf);CHKERRQ(ierr);
4484   PetscFunctionReturn(0);
4485 }
4486 
4487 PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSEBAND(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
4488 {
4489   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data,*b;
4490   IS                 isicol;
4491   PetscErrorCode     ierr;
4492   cudaError_t        cerr;
4493   const PetscInt     *ic,*ai=a->i,*aj=a->j;
4494   PetscScalar        *ba_t;
4495   int                *bi_t;
4496   PetscInt           i,n=A->rmap->n,Nf;
4497   PetscInt           nzBcsr,bwL,bwU;
4498   PetscBool          missing;
4499   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
4500   PetscContainer               container;
4501 
4502   PetscFunctionBegin;
4503   if (A->rmap->N != A->cmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"matrix must be square");
4504   ierr = MatMissingDiagonal(A,&missing,&i);CHKERRQ(ierr);
4505   if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",i);
4506   if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Missing cusparseTriFactors");
4507   ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&missing);CHKERRQ(ierr);
4508   if (!missing) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"only structurally symmetric matrices supported");
4509 
4510    // factor: get Nf if available
4511   ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr);
4512   if (container) {
4513     PetscInt *pNf=NULL;
4514     ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr);
4515     Nf = (*pNf)%1000;
4516     ierr = PetscContainerCreate(PETSC_COMM_SELF, &container);CHKERRQ(ierr);
4517     ierr = PetscMalloc(sizeof(PetscInt), &pNf);CHKERRQ(ierr);
4518     *pNf = Nf;
4519     ierr = PetscContainerSetPointer(container, (void *)pNf);CHKERRQ(ierr);
4520     ierr = PetscContainerSetUserDestroy(container, MatrixNfDestroy);CHKERRQ(ierr);
4521     ierr = PetscObjectCompose((PetscObject)B, "Nf", (PetscObject) container);CHKERRQ(ierr);
4522     ierr = PetscContainerDestroy(&container);CHKERRQ(ierr);
4523   } else Nf = 1;
4524   if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf);
4525 
4526   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
4527   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
4528 
4529   ierr = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
4530   ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)isicol);CHKERRQ(ierr);
4531   b    = (Mat_SeqAIJ*)(B)->data;
4532 
4533   /* get band widths, MatComputeBandwidth should take a reordering ic and do this */
4534   bwL = bwU = 0;
4535   for (int rwb=0; rwb<n; rwb++) {
4536     const PetscInt rwa = ic[rwb], anz = ai[rwb+1] - ai[rwb], *ajtmp = aj + ai[rwb];
4537     for (int j=0;j<anz;j++) {
4538       PetscInt colb = ic[ajtmp[j]];
4539       if (colb<rwa) { // L
4540         if (rwa-colb > bwL) bwL = rwa-colb;
4541       } else {
4542         if (colb-rwa > bwU) bwU = colb-rwa;
4543       }
4544     }
4545   }
4546   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
4547   /* only structurally symmetric matrices are supported for now; an unsymmetric band might still work */
4548   if (bwL!=bwU) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Only symmetric structure supported (now) W_L=%D W_U=%D",bwL,bwU);
4549   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
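  /* nonzeros of the band factor: a full band has (2*bwU+1)*n entries and clipping the two triangular
     ears of bwU*(bwU+1)/2 entries each leaves n + (2*n-1)*bwU - bwU*bwU */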
4550   nzBcsr = n + (2*n-1)*bwU - bwU*bwU;
4551   b->maxnz = b->nz = nzBcsr;
4552   cusparseTriFactors->nnz = b->nz; // only metadata needed: n & nz
4553   if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
4554   cerr = cudaMalloc(&ba_t,(b->nz+1)*sizeof(PetscScalar));CHKERRCUDA(cerr); // include a place for flops
4555   cerr = cudaMalloc(&bi_t,(n+1)*sizeof(int));CHKERRCUDA(cerr);
4556   cusparseTriFactors->a_band_d = ba_t;
4557   cusparseTriFactors->i_band_d = bi_t;
4558   /* In b structure:  Free imax, ilen, old a, old j.  Allocate solve_work, new a, new j */
4559   ierr = PetscLogObjectMemory((PetscObject)B,(nzBcsr+1)*(sizeof(PetscInt)+sizeof(PetscScalar)));CHKERRQ(ierr);
4560   {
4561     dim3 dimBlockTeam(1,128);
4562     dim3 dimBlockLeague(Nf,1);
4563     mat_lu_factor_band_init_set_i<<<dimBlockLeague,dimBlockTeam>>>(n, bwU, bi_t);
4564   }
4565   CHECK_LAUNCH_ERROR(); // does a sync
4566 
4567   // setup data
4568   if (!cusparseTriFactors->rpermIndices) {
4569     const PetscInt *r;
4570 
4571     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
4572     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
4573     cusparseTriFactors->rpermIndices->assign(r, r+n);
4574     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
4575     ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
4576   }
4577   /* upper triangular indices */
4578   if (!cusparseTriFactors->cpermIndices) {
4579     const PetscInt *c;
4580 
4581     ierr = ISGetIndices(isicol,&c);CHKERRQ(ierr);
4582     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
4583     cusparseTriFactors->cpermIndices->assign(c, c+n);
4584     ierr = ISRestoreIndices(isicol,&c);CHKERRQ(ierr);
4585     ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
4586   }
4587 
4588   /* put together the new matrix */
4589   b->free_a       = PETSC_FALSE;
4590   b->free_ij      = PETSC_FALSE;
4591   b->singlemalloc = PETSC_FALSE;
4592   b->ilen = NULL;
4593   b->imax = NULL;
4594   b->row  = isrow;
4595   b->col  = iscol;
4596   ierr    = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
4597   ierr    = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
4598   b->icol = isicol;
4599   ierr    = PetscMalloc1(n+1,&b->solve_work);CHKERRQ(ierr);
4600 
4601   B->factortype            = MAT_FACTOR_LU;
4602   B->info.factor_mallocs   = 0;
4603   B->info.fill_ratio_given = 0;
4604 
4605   if (ai[n]) {
4606     B->info.fill_ratio_needed = ((PetscReal)(nzBcsr))/((PetscReal)ai[n]);
4607   } else {
4608     B->info.fill_ratio_needed = 0.0;
4609   }
4610 #if defined(PETSC_USE_INFO)
4611   if (ai[n] != 0) {
4612     PetscReal af = B->info.fill_ratio_needed;
4613     ierr = PetscInfo1(A,"Band fill ratio %g\n",(double)af);CHKERRQ(ierr);
4614   } else {
4615     ierr = PetscInfo(A,"Empty matrix\n");CHKERRQ(ierr);
4616   }
4617 #endif
4618   if (a->inode.size) {
4619     ierr = PetscInfo(A,"Warning: using inodes in band solver.\n");CHKERRQ(ierr);
4620   }
4621   ierr = MatSeqAIJCheckInode_FactorLU(B);CHKERRQ(ierr);
4622   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSEBAND;
4623   B->offloadmask = PETSC_OFFLOAD_GPU;
4624 
4625   PetscFunctionReturn(0);
4626 }
4627 
4628 /* Use -pc_factor_mat_solver_type cusparseband */
4629 PetscErrorCode MatFactorGetSolverType_seqaij_cusparse_band(Mat A,MatSolverType *type)
4630 {
4631   PetscFunctionBegin;
4632   *type = MATSOLVERCUSPARSEBAND;
4633   PetscFunctionReturn(0);
4634 }
4635 
4636 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat A,MatFactorType ftype,Mat *B)
4637 {
4638   PetscErrorCode ierr;
4639   PetscInt       n = A->rmap->n;
4640 
4641   PetscFunctionBegin;
4642   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
4643   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
4644   (*B)->factortype = ftype;
4645   (*B)->canuseordering = PETSC_TRUE;
4646   ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
4647 
4648   if (ftype == MAT_FACTOR_LU) {
4649     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
4650     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
4651     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSEBAND;
4652   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSEBAND Matrix Types");
4653 
4654   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
4655   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse_band);CHKERRQ(ierr);
4656   PetscFunctionReturn(0);
4657 }
4658 
4659 #define WARP_SIZE 32
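// warp-level sum reduction via shuffles; after the loop, lane 0 of the warp holds the warp's total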
4660 template <typename T>
4661 __forceinline__ __device__
4662 T wreduce(T a)
4663 {
4664   T b;
4665   #pragma unroll
4666   for (int i = WARP_SIZE/2; i >= 1; i = i >> 1) {
4667     b = __shfl_down_sync(0xffffffff, a, i);
4668     a += b;
4669   }
4670   return a;
4671 }
4672 // reduce in a block, returns result in thread 0
4673 template <typename T, int BLOCK_SIZE>
4674 __device__
4675 T breduce(T a)
4676 {
4677   constexpr int NWARP = BLOCK_SIZE/WARP_SIZE;
4678   __shared__ double buf[NWARP];
4679   int wid = threadIdx.x / WARP_SIZE;
4680   int laneid = threadIdx.x % WARP_SIZE;
4681   T b = wreduce<T>(a);
4682   if (laneid == 0)
4683     buf[wid] = b;
4684   __syncthreads();
4685   if (wid == 0) {
4686     if (threadIdx.x < NWARP)
4687       a = buf[threadIdx.x];
4688     else
4689       a = 0;
4690     for (int i = (NWARP+1)/2; i >= 1; i = i >> 1) {
4691       a += __shfl_down_sync(0xffffffff, a, i);
4692     }
4693   }
4694   return a;
4695 }
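/*
  A minimal usage sketch (hypothetical kernel, not part of PETSc): every thread passes its partial
  value to breduce and only threadIdx.x == 0 receives the block-wide sum, e.g.

    template <int BS> __global__ void block_sum(const PetscReal *in, PetscReal *out)
    {
      PetscReal s = breduce<PetscReal,BS>(in[blockIdx.x*BS + threadIdx.x]);
      if (threadIdx.x == 0) out[blockIdx.x] = s; // the reduced value lives in thread 0 only
    }

  BS must be a multiple of WARP_SIZE, as it is for the 128-thread launch in MatSolve below.
*/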
4696 
4697 
4698 // Band LU kernel ---  ba_csr bi_csr
4699 template <int BLOCK_SIZE>
4700 __global__
4701 void __launch_bounds__(256,1)
4702 mat_solve_band(const PetscInt n, const PetscInt bw, const PetscScalar ba_csr[], PetscScalar x[])
4703 {
4704   const PetscInt    Nf = gridDim.x, nloc = n/Nf, field = blockIdx.x, start = field*nloc, end = start + nloc, chopnz = bw*(bw+1)/2, blocknz=(2*bw+1)*nloc, blocknz_0 = blocknz-chopnz;
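  /* per-field slices of the band storage: each field owns nloc rows; a full slice has
     blocknz = (2*bw+1)*nloc entries, chopnz = bw*(bw+1)/2 is one clipped ear, and the first and last
     fields' slices hold blocknz_0 = blocknz - chopnz entries since their outer ear is missing */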
4705   const PetscScalar *pLi;
4706   const int tid = threadIdx.x;
4707 
4708   /* Next, solve L */
4709   pLi = ba_csr + (field==0 ? 0 : blocknz_0 + (field-1)*blocknz + bw); // diagonal (0,0) in field
4710   for (int glbDD=start, locDD = 0; glbDD<end; glbDD++, locDD++) {
4711     const PetscInt col = locDD<bw ? start : (glbDD-bw);
4712     PetscScalar t = 0;
4713     for (int j=col+tid,idx=tid;j<glbDD;j+=blockDim.x,idx+=blockDim.x) {
4714       t += pLi[idx]*x[j];
4715     }
4716 #if defined(PETSC_USE_COMPLEX)
4717     PetscReal tr = PetscRealPartComplex(t), ti = PetscImaginaryPartComplex(t);
4718     PetscScalar tt(breduce<PetscReal,BLOCK_SIZE>(tr), breduce<PetscReal,BLOCK_SIZE>(ti));
4719     t = tt;
4720 #else
4721     t = breduce<PetscReal,BLOCK_SIZE>(t);
4722 #endif
4723     if (threadIdx.x == 0)
4724       x[glbDD] -= t; // /1.0
4725     __syncthreads();
4726     // inc
4727     pLi += glbDD-col; // get to diagonal
4728     if (glbDD > n-1-bw) pLi += n-1-glbDD; // skip over U, only last block has funny offset
4729     else pLi += bw;
4730     pLi += 1; // skip to next row
4731     if (field>0 && (locDD+1)<bw) pLi += bw-(locDD+1); // skip padding at beginning (ear)
4732   }
4733   /* Then, solve U */
4734   pLi = ba_csr + Nf*blocknz - 2*chopnz - 1; // end of real data on block (diagonal)
4735   if (field != Nf-1) pLi -= blocknz_0 + (Nf-2-field)*blocknz + bw; // diagonal of last local row
4736   for (int glbDD=end-1, locDD = 0; glbDD >= start; glbDD--, locDD++) {
4737     const PetscInt col = (locDD<bw) ? end-1 : glbDD+bw; // end of row in U
4738     PetscScalar t = 0;
4739     for (int j=col-tid,idx=tid;j>glbDD;j-=blockDim.x,idx+=blockDim.x) {
4740       t += pLi[-idx]*x[j];
4741     }
4742 #if defined(PETSC_USE_COMPLEX)
4743     PetscReal tr = PetscRealPartComplex(t), ti = PetscImaginaryPartComplex(t);
4744     PetscScalar tt(breduce<PetscReal,BLOCK_SIZE>(tr), breduce<PetscReal,BLOCK_SIZE>(ti));
4745     t = tt;
4746 #else
4747     t = breduce<PetscReal,BLOCK_SIZE>(PetscRealPart(t));
4748 #endif
4749     pLi -= col-glbDD; // diagonal
4750     if (threadIdx.x == 0) {
4751       x[glbDD] -= t;
4752       x[glbDD] /= pLi[0];
4753     }
4754     __syncthreads();
4755     // inc past L to start of previous U
4756     pLi -= bw+1;
4757     if (glbDD<bw) pLi += bw-glbDD; // overshot in top left corner
4758     if (((locDD+1) < bw) && field != Nf-1) pLi -= (bw - (locDD+1)); // skip past right corner
4759   }
4760 }
4761 
4762 static PetscErrorCode MatSolve_SeqAIJCUSPARSEBAND(Mat A,Vec bb,Vec xx)
4763 {
4764   const PetscScalar                     *barray;
4765   PetscScalar                           *xarray;
4766   thrust::device_ptr<const PetscScalar> bGPU;
4767   thrust::device_ptr<PetscScalar>       xGPU;
4768   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
4769   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
4770   PetscInt                              n=A->rmap->n, nz=cusparseTriFactors->nnz, bw=(2*n-1 - (int)(PetscSqrtReal(1+4*(n*n-nz))+PETSC_MACHINE_EPSILON))/2, Nf;
4771   PetscErrorCode                        ierr;
4772   cudaError_t                           cerr;
4773   PetscContainer                        container;
4774 
4775   PetscFunctionBegin;
4776   if (A->rmap->n == 0) {
4777     PetscFunctionReturn(0);
4778   }
4779   // factor: get Nf if available
4780   ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr);
4781   if (container) {
4782     PetscInt *pNf=NULL;
4783     ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr);
4784     Nf = (*pNf)%1000;
4785   } else Nf = 1;
4786   if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf);
4787 
4788   /* Get the GPU pointers */
4789   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
4790   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
4791   xGPU = thrust::device_pointer_cast(xarray);
4792   bGPU = thrust::device_pointer_cast(barray);
4793 
4794   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4795   /* First, reorder with the row permutation */
4796   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
4797                thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
4798                tempGPU->begin());
4799   constexpr int block = 128;
4800   mat_solve_band<block><<<Nf,block>>>(n,bw,cusparseTriFactors->a_band_d,tempGPU->data().get());
4801   CHECK_LAUNCH_ERROR(); // does a sync
4802 
4803   /* Last, reorder with the column permutation */
4804   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
4805                thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
4806                xGPU);
4807 
4808   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
4809   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
4810   cerr = WaitForCUDA();CHKERRCUDA(cerr);
4811   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4812   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
4813   PetscFunctionReturn(0);
4814 }
4815