/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <thrust/async/for_each.h>
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
#include <cooperative_groups.h>
#endif
const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc., we list them in
    ascending 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
  } cusparseCsr2CscAlg_t;
  */
  const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG","CSRMV_ALG1","CSRMV_ALG2","cusparseSpMVAlg_t","CUSPARSE_",0};
  const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
  const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif
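/* Illustration of the ordering invariant above (a sketch, not library code): PetscOptionsEnum()
   matches an option string against the array entries and stores the 0-based position it finds.
   For example, with MatCUSPARSESpMVAlgorithms[] as defined here,

     -mat_cusparse_spmv_alg csrmv_alg1

   is matched (case-insensitively) at position 2, which must equal the integer value of
   CUSPARSE_CSRMV_ALG1 in cusparseSpMVAlg_t. This is why the strings must stay in ascending
   enum-value order, and why MatSetFromOptions_SeqAIJCUSPARSE() below double-checks the values. */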

static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSEBAND(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSEBAND(Mat,Mat,const MatFactorInfo*);
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);

PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);

PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}
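/* Usage sketch (hypothetical caller code; assumes A is an assembled MATSEQAIJCUSPARSE matrix
   so that A->spptr and its cuSPARSE handle exist):

     cudaStream_t stream;
     cudaError_t  cerr;
     cerr = cudaStreamCreate(&stream);CHKERRCUDA(cerr);
     ierr = MatCUSPARSESetStream(A,stream);CHKERRQ(ierr);

   after which cuSPARSE work for A is queued on the given stream instead of the default stream. */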

PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          flg;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg || !cusparsestruct) PetscFunctionReturn(0);
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}

PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix solver type providing triangular solvers for sequential matrices
  on a single GPU of type seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm, but the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
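/* Example (a minimal sketch, assuming ksp is a KSP already attached to a MATSEQAIJCUSPARSE matrix):

     PC pc;
     ierr = KSPGetPC(ksp,&pc);CHKERRQ(ierr);
     ierr = PCSetType(pc,PCILU);CHKERRQ(ierr);
     ierr = PCFactorSetMatSolverType(pc,MATSOLVERCUSPARSE);CHKERRQ(ierr);

   or equivalently from the command line: -pc_type ilu -pc_factor_mat_solver_type cusparse */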

PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  (*B)->useordering = PETSC_TRUE;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
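/* Usage sketch for obtaining a factor matrix directly (hypothetical caller code; isrow, iscol,
   and info are assumed to have been set up by the caller):

     Mat F;
     ierr = MatGetFactor(A,MATSOLVERCUSPARSE,MAT_FACTOR_ILU,&F);CHKERRQ(ierr);
     ierr = MatILUFactorSymbolic(F,A,isrow,iscol,&info);CHKERRQ(ierr);
     ierr = MatLUFactorNumeric(F,A,&info);CHKERRQ(ierr);

   after which MatSolve(F,b,x) runs the triangular solves on the GPU. */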

PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.

   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
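/* Usage sketch (assumes A was created as MATSEQAIJCUSPARSE):

     ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,MAT_CUSPARSE_ELL);CHKERRQ(ierr);

   or, through the options database, -mat_cusparse_mult_storage_format ell */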

PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  switch (op) {
    case MAT_FORM_EXPLICIT_TRANSPOSE:
      /* need to destroy the transpose matrix if present to prevent logic errors if flg is set to true later */
      if (A->form_explicit_transpose && !flg) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);}
      A->form_explicit_transpose = flg;
      break;
    default:
      ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
      break;
  }
  PetscFunctionReturn(0);
}
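/* Usage sketch: a caller can request that an explicit transpose be formed and kept with

     ierr = MatSetOption(A,MAT_FORM_EXPLICIT_TRANSPOSE,PETSC_TRUE);CHKERRQ(ierr);

   so that repeated MatMultTranspose() calls can reuse a cached transpose on the GPU. */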

static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);

static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If the user used this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");

    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
   #endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
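/* Options database sketch: with CUDA >= 11, the algorithms above can be selected at run time, e.g.

     ./app -mat_type seqaijcusparse -mat_cusparse_storage_format csr \
           -mat_cusparse_spmv_alg csrmv_alg1 -mat_cusparse_csr2csc_alg alg1

   where each value is matched (case-insensitively) against the string arrays defined at the top
   of this file. The example executable name and options shown are illustrative only. */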

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {
        PetscScalar                       *AALo;

        cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the lower triangular matrix */
        cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the lower triangular matrix */
        AiLo[0]  = (PetscInt) 0;
        AiLo[n]  = nzLower;
        AjLo[0]  = (PetscInt) 0;
        AALo[0]  = (MatScalar) 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        rowOffset= 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
                               #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                               #endif
);CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h = AALo;
        cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0]  = 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
          offset      += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset      += 1;
          v  += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;
          vi = aj + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1./v[nz];
          AiUp[i]      = AiUp[i+1] - (nz+1);

          ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = n;
        upTriFactor->csrMat->num_cols = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                     upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                     upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                     upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                     &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
                               #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                               #endif
);CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
        upTriFactor->AA_h = AAUp;
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      } else {
        if (!upTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1./v[nz];
          ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
        ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  cusparseTriFactors->nnz=a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity && !cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
    ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* upper triangular indices */
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity && !cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
    ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                                #endif
);CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                                #endif
);CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      } else {
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip,*rip;

    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#define CHECK_LAUNCH_ERROR()                                                             \
do {                                                                                     \
  /* Check synchronous errors, i.e. pre-launch */                                        \
  cudaError_t err = cudaGetLastError();                                                  \
  if (cudaSuccess != err) {                                                              \
    SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cuda error: %s",cudaGetErrorString(err)); \
  }                                                                                      \
  /* Check asynchronous errors, i.e. kernel failed (ULF, unspecified launch failure) */  \
  err = cudaDeviceSynchronize();                                                         \
  if (cudaSuccess != err) {                                                              \
    SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cuda error: %s",cudaGetErrorString(err)); \
  }                                                                                      \
 } while (0)
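/* Usage sketch: invoke the macro right after an asynchronous kernel launch, e.g.

     MyKernel<<<grid,block>>>(args);   // MyKernel is a hypothetical kernel
     CHECK_LAUNCH_ERROR();

   cudaGetLastError() catches launch-configuration errors immediately, while
   cudaDeviceSynchronize() surfaces errors raised while the kernel executed. */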

static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                       loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                       loTriFactor->csrMat->values->data().get(),
                                       loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(),
                                       loTriFactorT->csrMat->values->data().get(),
                                       loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer
                        #else
                          loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
                        #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
1074   ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1075 
1076   /* Create the solve analysis information */
1077   ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1078   stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1079 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1080   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
1081                                 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1082                                 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1083                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
1084                                 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1085   cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1086 #endif
1087 
1088   /* perform the solve analysis */
1089   stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
1090                            loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1091                            loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1092                            loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo
1093                           #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1094                            ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
1095                           #endif
1096 );CHKERRCUSPARSE(stat);
1097   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1098   ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1099 
1100   /* assign the pointer */
1101   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1102 
1103   /*********************************************/
1104   /* Now the Transpose of the Upper Tri Factor */
1105   /*********************************************/
1106 
1107   /* allocate space for the transpose of the upper triangular factor */
1108   ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
1109   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1110 
1111   /* set the matrix descriptors of the upper triangular factor */
1112   matrixType = cusparseGetMatType(upTriFactor->descr);
1113   indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
1114   fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1115     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1116   diagType = cusparseGetMatDiagType(upTriFactor->descr);
1117 
1118   /* Create the matrix description */
1119   stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
1120   stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
1121   stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
1122   stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
1123   stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
1124 
1125   /* set the operation */
1126   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1127 
1128   /* allocate GPU space for the CSC of the upper triangular factor */
1129   upTriFactorT->csrMat = new CsrMatrix;
1130   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1131   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1132   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1133   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
1134   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1135   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1136 
1137   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1138 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1139   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
1140                                 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1141                                 upTriFactor->csrMat->values->data().get(),
1142                                 upTriFactor->csrMat->row_offsets->data().get(),
1143                                 upTriFactor->csrMat->column_indices->data().get(),
1144                                 upTriFactorT->csrMat->values->data().get(),
1145                                 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1146                                 CUSPARSE_ACTION_NUMERIC,indexBase,
1147                                 CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1148   cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1149 #endif
1150 
1151   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1152   stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
1153                           upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1154                           upTriFactor->csrMat->values->data().get(),
1155                           upTriFactor->csrMat->row_offsets->data().get(),
1156                           upTriFactor->csrMat->column_indices->data().get(),
1157                           upTriFactorT->csrMat->values->data().get(),
1158                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1159                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1160                           CUSPARSE_ACTION_NUMERIC, indexBase,
1161                           CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer
1162                         #else
1163                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1164                           CUSPARSE_ACTION_NUMERIC, indexBase
1165                         #endif
1166 );CHKERRCUSPARSE(stat);
1167   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1168   ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1169 
1170   /* Create the solve analysis information */
1171   ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1172   stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1173   #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1174   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
1175                                  upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1176                                  upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1177                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
1178                                  &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1179   cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1180   #endif
1181 
1182   /* perform the solve analysis */
1183   stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
1184                            upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1185                            upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1186                            upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo
1187                           #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1188                            ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
1189                           #endif
1190 );CHKERRCUSPARSE(stat);
1191   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1192   ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1193 
1194   /* assign the pointer */
1195   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1196   PetscFunctionReturn(0);
1197 }
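/* Note on the layout used above: the "transpose" of each triangular factor is materialized
   as the CSC of the factor, and since the CSC arrays of a matrix are exactly the CSR arrays
   of its transpose, the factor's fill mode is flipped (upper <-> lower) while the solves can
   keep CUSPARSE_OPERATION_NON_TRANSPOSE. For example, for a 2x2 upper factor

       U = [u11 u12]
           [  0 u22]

   the CSC of U is the CSR of U^T, which is lower triangular. */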
1198 
1199 struct PetscScalarToPetscInt
1200 {
1201   __host__ __device__
1202   PetscInt operator()(PetscScalar s)
1203   {
1204     return (PetscInt)PetscRealPart(s);
1205   }
1206 };
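/* PetscScalarToPetscInt supports the csr2csc_i trick used in
   MatSeqAIJCUSPARSEFormExplicitTransposeForMult() below: csr2csc is run once with the
   values 0,1,2,... (stored as PetscScalar), so after the conversion matrixT->values[k]
   holds the index of the source CSR entry for CSC slot k. Casting those values back to
   integers yields a permutation that turns every later transpose update into a plain
   gather,

     matrixT->values[k] = matrix->values[csr2csc_i[k]];

   instead of a full csr2csc call. */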
1207 
1208 static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTransposeForMult(Mat A)
1209 {
1210   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1211   Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
1212   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1213   cusparseStatus_t             stat;
1214   cusparseIndexBase_t          indexBase;
1215   cudaError_t                  err;
1216   PetscErrorCode               ierr;
1217 
1218   PetscFunctionBegin;
1219   if (!A->form_explicit_transpose || !A->rmap->n || !A->cmap->n) PetscFunctionReturn(0);
1220   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
1221   matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1222   if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing mat struct");
1223   matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1224   if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing matTranspose struct");
1225   if (A->transupdated) PetscFunctionReturn(0);
1226   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1227   if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
1228     ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
1229   }
1230   if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1231     matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
1232     stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
1233     indexBase = cusparseGetMatIndexBase(matstruct->descr);
1234     stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
1235     stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
1236 
1237     /* set alpha and beta */
1238     err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
1239     err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1240     err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1241     err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1242     err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1243     err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1244 
1245     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1246       CsrMatrix *matrixT = new CsrMatrix;
1247       matstructT->mat = matrixT;
1248       matrixT->num_rows = A->cmap->n;
1249       matrixT->num_cols = A->rmap->n;
1250       matrixT->num_entries = a->nz;
1251       matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
1252       matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1253       matrixT->values = new THRUSTARRAY(a->nz);
1254 
1255       if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
1256       cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
1257 
1258      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1259       stat = cusparseCreateCsr(&matstructT->matDescr,
1260                                matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
1261                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
1262                                matrixT->values->data().get(),
1263                                CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1264                                indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
1265      #endif
1266     } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1267    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1268       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1269    #else
1270       CsrMatrix *temp  = new CsrMatrix;
1271       CsrMatrix *tempT = new CsrMatrix;
1272       /* First convert HYB to CSR */
1273       temp->num_rows = A->rmap->n;
1274       temp->num_cols = A->cmap->n;
1275       temp->num_entries = a->nz;
1276       temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1277       temp->column_indices = new THRUSTINTARRAY32(a->nz);
1278       temp->values = new THRUSTARRAY(a->nz);
1279 
1280       stat = cusparse_hyb2csr(cusparsestruct->handle,
1281                               matstruct->descr, (cusparseHybMat_t)matstruct->mat,
1282                               temp->values->data().get(),
1283                               temp->row_offsets->data().get(),
1284                               temp->column_indices->data().get());CHKERRCUSPARSE(stat);
1285 
1286       /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1287       tempT->num_rows = A->rmap->n;
1288       tempT->num_cols = A->cmap->n;
1289       tempT->num_entries = a->nz;
1290       tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1291       tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1292       tempT->values = new THRUSTARRAY(a->nz);
1293 
1294       stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
1295                               temp->num_cols, temp->num_entries,
1296                               temp->values->data().get(),
1297                               temp->row_offsets->data().get(),
1298                               temp->column_indices->data().get(),
1299                               tempT->values->data().get(),
1300                               tempT->column_indices->data().get(),
1301                               tempT->row_offsets->data().get(),
1302                               CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1303 
1304       /* Last, convert CSC to HYB */
1305       cusparseHybMat_t hybMat;
1306       stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1307       cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1308         CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1309       stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
1310                               matstructT->descr, tempT->values->data().get(),
1311                               tempT->row_offsets->data().get(),
1312                               tempT->column_indices->data().get(),
1313                               hybMat, 0, partition);CHKERRCUSPARSE(stat);
1314 
1315       /* assign the pointer */
1316       matstructT->mat = hybMat;
1317       A->transupdated = PETSC_TRUE;
1318       /* delete temporaries */
1319       if (tempT) {
1320         if (tempT->values) delete (THRUSTARRAY*) tempT->values;
1321         if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
1322         if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
1323         delete (CsrMatrix*) tempT;
1324       }
1325       if (temp) {
1326         if (temp->values) delete (THRUSTARRAY*) temp->values;
1327         if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
1328         if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
1329         delete (CsrMatrix*) temp;
1330       }
1331      #endif
1332     }
1333   }
1334   if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1335     CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
1336     CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
1337     if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix");
1338     if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix rows");
1339     if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix cols");
1340     if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix values");
1341     if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT");
1342     if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT rows");
1343     if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT cols");
1344     if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT values");
1345     if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1346       cusparsestruct->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
1347       cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
1348       ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
1349     }
1350     if (!cusparsestruct->csr2csc_i) {
1351       THRUSTARRAY csr2csc_a(matrix->num_entries);
1352       PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));
1353 
1354       indexBase = cusparseGetMatIndexBase(matstruct->descr);
1355      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1356       void   *csr2cscBuffer;
1357       size_t csr2cscBufferSize;
1358       stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
1359                                            A->cmap->n, matrix->num_entries,
1360                                            matrix->values->data().get(),
1361                                            cusparsestruct->rowoffsets_gpu->data().get(),
1362                                            matrix->column_indices->data().get(),
1363                                            matrixT->values->data().get(),
1364                                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1365                                            CUSPARSE_ACTION_NUMERIC,indexBase,
1366                                            cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat);
1367       err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err);
1368      #endif
1369 
1370       if (matrix->num_entries) {
1371         /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
1372            mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
1373            I checked every parameter and they were all fine. I have no clue why cusparse complains.
1374 
1375            Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
1376            should be filled with indexBase. So I just take a shortcut here.
1377         */
1378         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1379                               A->cmap->n,matrix->num_entries,
1380                               csr2csc_a.data().get(),
1381                               cusparsestruct->rowoffsets_gpu->data().get(),
1382                               matrix->column_indices->data().get(),
1383                               matrixT->values->data().get(),
1384                              #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1385                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1386                               CUSPARSE_ACTION_NUMERIC,indexBase,
1387                               cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat);
1388                              #else
1389                               matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1390                               CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1391                              #endif
1392       } else {
1393         matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
1394       }
1395 
1396       cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1397       PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
1398      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1399       err = cudaFree(csr2cscBuffer);CHKERRCUDA(err);
1400      #endif
1401     }
1402     PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
1403                                                      thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
1404                                                      matrixT->values->begin()));
1405   }
1406   ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1407   /* the compressed row indices are not used for matTranspose */
1408   matstructT->cprowIndices = NULL;
1409   /* assign the pointer */
1410   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
1411   A->transupdated = PETSC_TRUE;
1412   PetscFunctionReturn(0);
1413 }
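/* A minimal usage sketch (assuming the standard MatOption name MAT_FORM_EXPLICIT_TRANSPOSE):

     MatSetOption(A,MAT_FORM_EXPLICIT_TRANSPOSE,PETSC_TRUE);
     MatMultTranspose(A,x,y); // reuses the cached transpose until the values change

   Without the option set, A->form_explicit_transpose is false and the routine above
   returns immediately. */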
1414 
1415 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
1416 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1417 {
1418   PetscInt                              n = xx->map->n;
1419   const PetscScalar                     *barray;
1420   PetscScalar                           *xarray;
1421   thrust::device_ptr<const PetscScalar> bGPU;
1422   thrust::device_ptr<PetscScalar>       xGPU;
1423   cusparseStatus_t                      stat;
1424   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1425   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1426   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1427   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1428   PetscErrorCode                        ierr;
1429   cudaError_t                           cerr;
1430 
1431   PetscFunctionBegin;
1432   /* Analyze the matrix and create the transpose ... on the fly */
1433   if (!loTriFactorT && !upTriFactorT) {
1434     ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
1435     loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1436     upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1437   }
1438 
1439   /* Get the GPU pointers */
1440   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1441   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1442   xGPU = thrust::device_pointer_cast(xarray);
1443   bGPU = thrust::device_pointer_cast(barray);
1444 
1445   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1446   /* First, reorder with the row permutation */
1447   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1448                thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
1449                xGPU);
1450 
1451   /* Next, solve U */
1452   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1453                         upTriFactorT->csrMat->num_rows,
1454                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1455                         upTriFactorT->csrMat->num_entries,
1456                       #endif
1457                         &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1458                         upTriFactorT->csrMat->values->data().get(),
1459                         upTriFactorT->csrMat->row_offsets->data().get(),
1460                         upTriFactorT->csrMat->column_indices->data().get(),
1461                         upTriFactorT->solveInfo,
1462                         xarray, tempGPU->data().get()
1463                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1464                         ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
1465                       #endif
1466 );CHKERRCUSPARSE(stat);
1467 
1468   /* Then, solve L */
1469   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1470                         loTriFactorT->csrMat->num_rows,
1471                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1472                         loTriFactorT->csrMat->num_entries,
1473                       #endif
1474                         &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1475                         loTriFactorT->csrMat->values->data().get(),
1476                         loTriFactorT->csrMat->row_offsets->data().get(),
1477                         loTriFactorT->csrMat->column_indices->data().get(),
1478                         loTriFactorT->solveInfo,
1479                         tempGPU->data().get(), xarray
1480                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1481                         ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
1482                       #endif
1483 );CHKERRCUSPARSE(stat);
1484 
1485   /* Last, copy the solution, xGPU, into a temporary with the column permutation; this cannot be done in place. */
1486   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1487                thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
1488                tempGPU->begin());
1489 
1490   /* Copy the temporary to the full solution. */
1491   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);
1492 
1493   /* restore */
1494   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1495   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
1496   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1497   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1498   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1499   PetscFunctionReturn(0);
1500 }
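/* Summary of the sequence above:

     x[i] = b[rperm[i]]            gather with the row permutation
     t    = (U^T)^{-1} x           solve with the transposed upper factor
     x    = (L^T)^{-1} t           solve with the transposed lower factor
     x[i] = x[cperm[i]]            gather with the column permutation (via tempGPU)
*/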
1501 
1502 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1503 {
1504   const PetscScalar                 *barray;
1505   PetscScalar                       *xarray;
1506   cusparseStatus_t                  stat;
1507   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1508   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1509   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1510   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1511   PetscErrorCode                    ierr;
1512   cudaError_t                       cerr;
1513 
1514   PetscFunctionBegin;
1515   /* Analyze the matrix and create the transpose ... on the fly */
1516   if (!loTriFactorT && !upTriFactorT) {
1517     ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
1518     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1519     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1520   }
1521 
1522   /* Get the GPU pointers */
1523   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1524   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1525 
1526   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1527   /* First, solve U */
1528   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1529                         upTriFactorT->csrMat->num_rows,
1530                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1531                         upTriFactorT->csrMat->num_entries,
1532                       #endif
1533                         &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1534                         upTriFactorT->csrMat->values->data().get(),
1535                         upTriFactorT->csrMat->row_offsets->data().get(),
1536                         upTriFactorT->csrMat->column_indices->data().get(),
1537                         upTriFactorT->solveInfo,
1538                         barray, tempGPU->data().get()
1539                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1540                         ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
1541                       #endif
1542 );CHKERRCUSPARSE(stat);
1543 
1544   /* Then, solve L */
1545   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1546                         loTriFactorT->csrMat->num_rows,
1547                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1548                         loTriFactorT->csrMat->num_entries,
1549                       #endif
1550                         &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1551                         loTriFactorT->csrMat->values->data().get(),
1552                         loTriFactorT->csrMat->row_offsets->data().get(),
1553                         loTriFactorT->csrMat->column_indices->data().get(),
1554                         loTriFactorT->solveInfo,
1555                         tempGPU->data().get(), xarray
1556                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1557                         ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
1558                       #endif
1559 );CHKERRCUSPARSE(stat);
1560 
1561   /* restore */
1562   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1563   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
1564   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1565   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1566   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1567   PetscFunctionReturn(0);
1568 }
1569 
1570 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1571 {
1572   const PetscScalar                     *barray;
1573   PetscScalar                           *xarray;
1574   thrust::device_ptr<const PetscScalar> bGPU;
1575   thrust::device_ptr<PetscScalar>       xGPU;
1576   cusparseStatus_t                      stat;
1577   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1578   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1579   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1580   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1581   PetscErrorCode                        ierr;
1582   cudaError_t                           cerr;
1583 
1584   PetscFunctionBegin;
1585 
1586   /* Get the GPU pointers */
1587   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1588   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1589   xGPU = thrust::device_pointer_cast(xarray);
1590   bGPU = thrust::device_pointer_cast(barray);
1591 
1592   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1593   /* First, reorder with the row permutation */
1594   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1595                thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
1596                tempGPU->begin());
1597 
1598   /* Next, solve L */
1599   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1600                         loTriFactor->csrMat->num_rows,
1601                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1602                         loTriFactor->csrMat->num_entries,
1603                       #endif
1604                         &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1605                         loTriFactor->csrMat->values->data().get(),
1606                         loTriFactor->csrMat->row_offsets->data().get(),
1607                         loTriFactor->csrMat->column_indices->data().get(),
1608                         loTriFactor->solveInfo,
1609                         tempGPU->data().get(), xarray
1610                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1611                         ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
1612                       #endif
1613 );CHKERRCUSPARSE(stat);
1614 
1615   /* Then, solve U */
1616   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1617                         upTriFactor->csrMat->num_rows,
1618                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1619                         upTriFactor->csrMat->num_entries,
1620                       #endif
1621                         &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1622                         upTriFactor->csrMat->values->data().get(),
1623                         upTriFactor->csrMat->row_offsets->data().get(),
1624                         upTriFactor->csrMat->column_indices->data().get(),
1625                         upTriFactor->solveInfo,
1626                         xarray, tempGPU->data().get()
1627                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1628                         ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
1629                       #endif
1630 );CHKERRCUSPARSE(stat);
1631 
1632   /* Last, reorder with the column permutation */
1633   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
1634                thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
1635                xGPU);
1636 
1637   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1638   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
1639   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1640   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1641   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1642   PetscFunctionReturn(0);
1643 }
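/* The forward solve above is the mirror image:

     t[i] = b[rperm[i]]            gather with the row permutation
     x    = L^{-1} t               forward solve
     t    = U^{-1} x               back solve
     x[i] = t[cperm[i]]            gather with the column permutation

   i.e. x = Pc (U \ (L \ (Pr b))). */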
1644 
1645 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1646 {
1647   const PetscScalar                 *barray;
1648   PetscScalar                       *xarray;
1649   cusparseStatus_t                  stat;
1650   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1651   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1652   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1653   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1654   PetscErrorCode                    ierr;
1655   cudaError_t                       cerr;
1656 
1657   PetscFunctionBegin;
1658   /* Get the GPU pointers */
1659   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1660   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1661 
1662   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1663   /* First, solve L */
1664   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1665                         loTriFactor->csrMat->num_rows,
1666                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1667                         loTriFactor->csrMat->num_entries,
1668                       #endif
1669                         &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1670                         loTriFactor->csrMat->values->data().get(),
1671                         loTriFactor->csrMat->row_offsets->data().get(),
1672                         loTriFactor->csrMat->column_indices->data().get(),
1673                         loTriFactor->solveInfo,
1674                         barray, tempGPU->data().get()
1675                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1676                         ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
1677                       #endif
1678 );CHKERRCUSPARSE(stat);
1679 
1680   /* Next, solve U */
1681   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1682                         upTriFactor->csrMat->num_rows,
1683                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1684                         upTriFactor->csrMat->num_entries,
1685                       #endif
1686                         &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1687                         upTriFactor->csrMat->values->data().get(),
1688                         upTriFactor->csrMat->row_offsets->data().get(),
1689                         upTriFactor->csrMat->column_indices->data().get(),
1690                         upTriFactor->solveInfo,
1691                         tempGPU->data().get(), xarray
1692                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1693                         ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
1694                       #endif
1695 );CHKERRCUSPARSE(stat);
1696 
1697   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1698   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
1699   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1700   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1701   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1702   PetscFunctionReturn(0);
1703 }
1704 
1705 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
1706 {
1707   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
1708   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
1709   cudaError_t        cerr;
1710   PetscErrorCode     ierr;
1711 
1712   PetscFunctionBegin;
1713   if (A->offloadmask == PETSC_OFFLOAD_GPU) {
1714     CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat;
1715 
1716     ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
1717     cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
1718     cerr = WaitForCUDA();CHKERRCUDA(cerr);
1719     ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr);
1720     ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
1721     A->offloadmask = PETSC_OFFLOAD_BOTH;
1722   }
1723   PetscFunctionReturn(0);
1724 }
1725 
1726 static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
1727 {
1728   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
1729   PetscErrorCode ierr;
1730 
1731   PetscFunctionBegin;
1732   ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
1733   *array = a->a;
1734   A->offloadmask = PETSC_OFFLOAD_CPU;
1735   PetscFunctionReturn(0);
1736 }
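/* Offload-state bookkeeping used by the two routines above:
     PETSC_OFFLOAD_GPU  - values live only on the device, so
                          MatSeqAIJCUSPARSECopyFromGPU() downloads them
     PETSC_OFFLOAD_BOTH - host and device copies agree (set after the download)
     PETSC_OFFLOAD_CPU  - set by MatSeqAIJGetArray_SeqAIJCUSPARSE() because the returned
                          host array may be modified, which makes the device copy stale */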
1737 
1738 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1739 {
1740   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1741   Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
1742   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1743   PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
1744   PetscErrorCode               ierr;
1745   cusparseStatus_t             stat;
1746   PetscBool                    both = PETSC_TRUE;
1747   cudaError_t                  err;
1748 
1749   PetscFunctionBegin;
1750   if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Cannot copy to GPU");
1751   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
1752     if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
1753       CsrMatrix *matrix;
1754       matrix = (CsrMatrix*)cusparsestruct->mat->mat;
1755 
1756       if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR values");
1757       ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1758       matrix->values->assign(a->a, a->a+a->nz);
1759       err  = WaitForCUDA();CHKERRCUDA(err);
1760       ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
1761       ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1762       ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
1763     } else {
1764       PetscInt nnz;
1765       ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1766       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
1767       ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
1768       delete cusparsestruct->workVector;
1769       delete cusparsestruct->rowoffsets_gpu;
1770       cusparsestruct->workVector = NULL;
1771       cusparsestruct->rowoffsets_gpu = NULL;
1772       try {
1773         if (a->compressedrow.use) {
1774           m    = a->compressedrow.nrows;
1775           ii   = a->compressedrow.i;
1776           ridx = a->compressedrow.rindex;
1777         } else {
1778           m    = A->rmap->n;
1779           ii   = a->i;
1780           ridx = NULL;
1781         }
1782         if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR row data");
1783         if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR column data");
1784         if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
1785         else nnz = a->nz;
1786 
1787         /* create cusparse matrix */
1788         cusparsestruct->nrows = m;
1789         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1790         stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
1791         stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
1792         stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
1793 
1794         err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
1795         err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1796         err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1797         err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1798         err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1799         err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1800         stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
1801 
1802         /* Build the matrix in the requested storage format (CSR here; hybrid/ELLPACK in the branch below) */
1803         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1804           /* set the matrix */
1805           CsrMatrix *mat= new CsrMatrix;
1806           mat->num_rows = m;
1807           mat->num_cols = A->cmap->n;
1808           mat->num_entries = nnz;
1809           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1810           mat->row_offsets->assign(ii, ii + m+1);
1811 
1812           mat->column_indices = new THRUSTINTARRAY32(nnz);
1813           mat->column_indices->assign(a->j, a->j+nnz);
1814 
1815           mat->values = new THRUSTARRAY(nnz);
1816           if (a->a) mat->values->assign(a->a, a->a+nnz);
1817 
1818           /* assign the pointer */
1819           matstruct->mat = mat;
1820          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1821           if (mat->num_rows) { /* cusparse errors on empty matrices! */
1822             stat = cusparseCreateCsr(&matstruct->matDescr,
1823                                     mat->num_rows, mat->num_cols, mat->num_entries,
1824                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
1825                                     mat->values->data().get(),
1826                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
1827                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
1828           }
1829          #endif
1830         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1831          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1832           SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1833          #else
1834           CsrMatrix *mat= new CsrMatrix;
1835           mat->num_rows = m;
1836           mat->num_cols = A->cmap->n;
1837           mat->num_entries = nnz;
1838           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1839           mat->row_offsets->assign(ii, ii + m+1);
1840 
1841           mat->column_indices = new THRUSTINTARRAY32(nnz);
1842           mat->column_indices->assign(a->j, a->j+nnz);
1843 
1844           mat->values = new THRUSTARRAY(nnz);
1845           if (a->a) mat->values->assign(a->a, a->a+nnz);
1846 
1847           cusparseHybMat_t hybMat;
1848           stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1849           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1850             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1851           stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
1852               matstruct->descr, mat->values->data().get(),
1853               mat->row_offsets->data().get(),
1854               mat->column_indices->data().get(),
1855               hybMat, 0, partition);CHKERRCUSPARSE(stat);
1856           /* assign the pointer */
1857           matstruct->mat = hybMat;
1858 
1859           if (mat) {
1860             if (mat->values) delete (THRUSTARRAY*)mat->values;
1861             if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
1862             if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
1863             delete (CsrMatrix*)mat;
1864           }
1865          #endif
1866         }
1867 
1868         /* assign the compressed row indices */
1869         if (a->compressedrow.use) {
1870           cusparsestruct->workVector = new THRUSTARRAY(m);
1871           matstruct->cprowIndices    = new THRUSTINTARRAY(m);
1872           matstruct->cprowIndices->assign(ridx,ridx+m);
1873           tmp = m;
1874         } else {
1875           cusparsestruct->workVector = NULL;
1876           matstruct->cprowIndices    = NULL;
1877           tmp = 0;
1878         }
1879         ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);
1880 
1881         /* assign the pointer */
1882         cusparsestruct->mat = matstruct;
1883       } catch(char *ex) {
1884         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1885       }
1886       err  = WaitForCUDA();CHKERRCUDA(err);
1887       ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1888       cusparsestruct->nonzerostate = A->nonzerostate;
1889     }
1890     if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
1891   }
1892   PetscFunctionReturn(0);
1893 }
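/* Recap of the copy strategy above: when the nonzero pattern is unchanged
   (A->nonzerostate == cusparsestruct->nonzerostate) and the format is CSR, only the values
   array is re-uploaded; any other change tears down and rebuilds the whole GPU
   representation (offsets, indices, values, descriptors, compressed-row index). A typical
   trigger, for illustration only:

     MatSetValues(A,...);                              // host insertions
     MatAssemblyBegin/End(A,MAT_FINAL_ASSEMBLY);       // offloadmask becomes PETSC_OFFLOAD_CPU
     MatMult(A,x,y);                                   // calls MatSeqAIJCUSPARSECopyToGPU(A)
*/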
1894 
1895 struct VecCUDAPlusEquals
1896 {
1897   template <typename Tuple>
1898   __host__ __device__
1899   void operator()(Tuple t)
1900   {
1901     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
1902   }
1903 };
1904 
1905 struct VecCUDAEquals
1906 {
1907   template <typename Tuple>
1908   __host__ __device__
1909   void operator()(Tuple t)
1910   {
1911     thrust::get<1>(t) = thrust::get<0>(t);
1912   }
1913 };
1914 
1915 struct VecCUDAEqualsReverse
1916 {
1917   template <typename Tuple>
1918   __host__ __device__
1919   void operator()(Tuple t)
1920   {
1921     thrust::get<0>(t) = thrust::get<1>(t);
1922   }
1923 };
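/* These functors are meant for thrust::for_each over zipped sequences, e.g. (sketch):

     thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(src,   dst)),
                      thrust::make_zip_iterator(thrust::make_tuple(src+n, dst+n)),
                      VecCUDAPlusEquals());   // dst[i] += src[i]

   VecCUDAEquals copies get<0> into get<1>; VecCUDAEqualsReverse copies the other way,
   which is handy when only one leg of the zip is a permutation iterator. */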
1924 
1925 struct MatMatCusparse {
1926   PetscBool             cisdense;
1927   PetscScalar           *Bt;
1928   Mat                   X;
1929   PetscBool             reusesym; /* cuSPARSE does not have separate symbolic and numeric phases for sparse matmat operations */
1930   PetscLogDouble        flops;
1931   CsrMatrix             *Bcsr;
1932 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1933   cusparseSpMatDescr_t  matSpBDescr;
1934   PetscBool             initialized;   /* C = alpha op(A) op(B) + beta C */
1935   cusparseDnMatDescr_t  matBDescr;
1936   cusparseDnMatDescr_t  matCDescr;
1937   PetscInt              Blda,Clda; /* Record leading dimensions of B and C here to detect changes */
1938   size_t                mmBufferSize;
1939   void                  *mmBuffer;
1940   void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
1941   cusparseSpGEMMDescr_t spgemmDesc;
1942 #endif
1943 };
1944 
1945 static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
1946 {
1947   PetscErrorCode   ierr;
1948   MatMatCusparse   *mmdata = (MatMatCusparse *)data;
1949   cudaError_t      cerr;
1950  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1951   cusparseStatus_t stat;
1952  #endif
1953 
1954   PetscFunctionBegin;
1955   cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr);
1956   delete mmdata->Bcsr;
1957  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1958   if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); }
1959   if (mmdata->mmBuffer)    { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); }
1960   if (mmdata->mmBuffer2)   { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); }
1961   if (mmdata->matBDescr)   { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); }
1962   if (mmdata->matCDescr)   { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); }
1963   if (mmdata->spgemmDesc)  { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); }
1964  #endif
1965   ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr);
1966   ierr = PetscFree(data);CHKERRQ(ierr);
1967   PetscFunctionReturn(0);
1968 }
1969 
1970 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1971 
1972 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
1973 {
1974   Mat_Product                  *product = C->product;
1975   Mat                          A,B;
1976   PetscInt                     m,n,blda,clda;
1977   PetscBool                    flg,biscuda;
1978   Mat_SeqAIJCUSPARSE           *cusp;
1979   cusparseStatus_t             stat;
1980   cusparseOperation_t          opA;
1981   const PetscScalar            *barray;
1982   PetscScalar                  *carray;
1983   PetscErrorCode               ierr;
1984   MatMatCusparse               *mmdata;
1985   Mat_SeqAIJCUSPARSEMultStruct *mat;
1986   CsrMatrix                    *csrmat;
1987   cudaError_t                  cerr;
1988 
1989   PetscFunctionBegin;
1990   MatCheckProduct(C,1);
1991   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
1992   mmdata = (MatMatCusparse*)product->data;
1993   A    = product->A;
1994   B    = product->B;
1995   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
1996   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
1997   /* currently CopyToGpu does not copy if the matrix is bound to the CPU
1998      Instead of silently accepting the wrong answer, I prefer to raise an error */
1999   if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2000   ierr   = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2001   cusp   = (Mat_SeqAIJCUSPARSE*)A->spptr;
2002   switch (product->type) {
2003   case MATPRODUCT_AB:
2004   case MATPRODUCT_PtAP:
2005     mat = cusp->mat;
2006     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2007     m   = A->rmap->n;
2008     n   = B->cmap->n;
2009     break;
2010   case MATPRODUCT_AtB:
2011     if (!A->form_explicit_transpose) {
2012       mat = cusp->mat;
2013       opA = CUSPARSE_OPERATION_TRANSPOSE;
2014     } else {
2015       ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
2016       mat  = cusp->matTranspose;
2017       opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
2018     }
2019     m = A->cmap->n;
2020     n = B->cmap->n;
2021     break;
2022   case MATPRODUCT_ABt:
2023   case MATPRODUCT_RARt:
2024     mat = cusp->mat;
2025     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2026     m   = A->rmap->n;
2027     n   = B->rmap->n;
2028     break;
2029   default:
2030     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2031   }
2032   if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
2033   csrmat = (CsrMatrix*)mat->mat;
2034   /* if the user passed a CPU matrix, copy the data to the GPU */
2035   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
2036   if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
2037   ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);
2038 
2039   ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
2040   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2041     ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
2042     ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
2043   } else {
2044     ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
2045     ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
2046   }
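  /* For MATPRODUCT_RARt and MATPRODUCT_PtAP the sparse product below only forms the dense
     intermediate X: X = A*B^T (RARt) or X = A*B (PtAP); the final dense product
     C = B*X (RARt) or C = B^T*X (PtAP) is formed at the end of this routine with
     MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(). */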
2047 
2048   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2049  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2050   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2051   /* (re)allocate mmBuffer if not initialized or the LDAs are different */
2052   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2053     size_t mmBufferSize;
2054     if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
2055     if (!mmdata->matBDescr) {
2056       stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
2057       mmdata->Blda = blda;
2058     }
2059 
2060     if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
2061     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2062       stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
2063       mmdata->Clda = clda;
2064     }
2065 
2066     if (!mat->matDescr) {
2067       stat = cusparseCreateCsr(&mat->matDescr,
2068                                csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
2069                                csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
2070                                csrmat->values->data().get(),
2071                                CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2072                                CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
2073     }
2074     stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
2075                                    mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2076                                    mmdata->matCDescr,cusparse_scalartype,
2077                                    cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
2078     if (!mmdata->mmBuffer || mmdata->mmBufferSize < mmBufferSize) {
2079       cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
2080       cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
2081       mmdata->mmBufferSize = mmBufferSize;
2082     }
2083     mmdata->initialized = PETSC_TRUE;
2084   } else {
2085     /* to be safe, always update the data pointers of the matrices */
2086     stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
2087     stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
2088     stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
2089   }
2090 
2091   /* do cusparseSpMM, which supports transpose on B */
2092   stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
2093                       mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2094                       mmdata->matCDescr,cusparse_scalartype,
2095                       cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2096  #else
2097   PetscInt k;
2098   /* cusparseXcsrmm does not support transpose on B */
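  /* workaround below: first explicitly transpose B into the preallocated buffer mmdata->Bt
     using cublasXgeam (geam computes C = alpha*op(A) + beta*op(B); here alpha = 1, beta = 0),
     then run csrmm on the already-transposed Bt */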
2099   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2100     cublasHandle_t cublasv2handle;
2101     cublasStatus_t cerr;
2102 
2103     ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
2104     cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
2105                        B->cmap->n,B->rmap->n,
2106                        &PETSC_CUSPARSE_ONE ,barray,blda,
2107                        &PETSC_CUSPARSE_ZERO,barray,blda,
2108                        mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
2109     blda = B->cmap->n;
2110     k    = B->cmap->n;
2111   } else {
2112     k    = B->rmap->n;
2113   }
2114 
2115   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2116   stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
2117                            csrmat->num_entries,mat->alpha_one,mat->descr,
2118                            csrmat->values->data().get(),
2119                            csrmat->row_offsets->data().get(),
2120                            csrmat->column_indices->data().get(),
2121                            mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
2122                            carray,clda);CHKERRCUSPARSE(stat);
2123  #endif
2124   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2125   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2126   ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
2127   ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
2128   if (product->type == MATPRODUCT_RARt) {
2129     ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
2130     ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
2131   } else if (product->type == MATPRODUCT_PtAP) {
2132     ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
2133     ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
2134   } else {
2135     ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
2136   }
2137   if (mmdata->cisdense) {
2138     ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
2139   }
2140   if (!biscuda) {
2141     ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
2142   }
2143   PetscFunctionReturn(0);
2144 }
2145 
2146 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2147 {
2148   Mat_Product        *product = C->product;
2149   Mat                A,B;
2150   PetscInt           m,n;
2151   PetscBool          cisdense,flg;
2152   PetscErrorCode     ierr;
2153   MatMatCusparse     *mmdata;
2154   Mat_SeqAIJCUSPARSE *cusp;
2155 
2156   PetscFunctionBegin;
2157   MatCheckProduct(C,1);
2158   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
2159   A    = product->A;
2160   B    = product->B;
2161   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2162   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
2163   cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2164   if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2165   switch (product->type) {
2166   case MATPRODUCT_AB:
2167     m = A->rmap->n;
2168     n = B->cmap->n;
2169     break;
2170   case MATPRODUCT_AtB:
2171     m = A->cmap->n;
2172     n = B->cmap->n;
2173     break;
2174   case MATPRODUCT_ABt:
2175     m = A->rmap->n;
2176     n = B->rmap->n;
2177     break;
2178   case MATPRODUCT_PtAP:
2179     m = B->cmap->n;
2180     n = B->cmap->n;
2181     break;
2182   case MATPRODUCT_RARt:
2183     m = B->rmap->n;
2184     n = B->rmap->n;
2185     break;
2186   default:
2187     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2188   }
2189   ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
2190   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy the result back to the CPU */
2191   ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
2192   ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);
2193 
2194   /* product data */
2195   ierr = PetscNew(&mmdata);CHKERRQ(ierr);
2196   mmdata->cisdense = cisdense;
2197  #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
2198   /* cusparseXcsrmm does not support transpose on B, so we allocate a buffer to store B^T */
2199   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2200     cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
2201   }
2202  #endif
2203   /* for these products we need intermediate storage */
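  /* a sketch of the strategy: for PtAP the numeric phase forms X = A*P with SpMM and then
     C = P^T*X with a dense GEMM; for RARt it forms X = A*R^T and then C = R*X. The matrix X
     created below holds that intermediate product */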
2204   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2205     ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
2206     ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
2207     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
2208       ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
2209     } else {
2210       ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
2211     }
2212   }
2213   C->product->data    = mmdata;
2214   C->product->destroy = MatDestroy_MatMatCusparse;
2215 
2216   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
2217   PetscFunctionReturn(0);
2218 }
2219 
2220 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2221 {
2222   Mat_Product                  *product = C->product;
2223   Mat                          A,B;
2224   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2225   Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
2226   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2227   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2228   PetscBool                    flg;
2229   PetscErrorCode               ierr;
2230   cusparseStatus_t             stat;
2231   cudaError_t                  cerr;
2232   MatProductType               ptype;
2233   MatMatCusparse               *mmdata;
2234 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2235   cusparseSpMatDescr_t         BmatSpDescr;
2236 #endif
2237 
2238   PetscFunctionBegin;
2239   MatCheckProduct(C,1);
2240   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
2241   ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2242   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for C of type %s",((PetscObject)C)->type_name);
2243   mmdata = (MatMatCusparse*)C->product->data;
2244   A = product->A;
2245   B = product->B;
2246   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
2247     mmdata->reusesym = PETSC_FALSE;
2248     Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2249     if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2250     Cmat = Ccusp->mat;
2251     if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
2252     Ccsr = (CsrMatrix*)Cmat->mat;
2253     if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
2254     goto finalize;
2255   }
2256   if (!c->nz) goto finalize;
2257   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2258   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
2259   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2260   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
2261   if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2262   if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2263   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2264   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2265   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2266   if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2267   if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2268   if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2269   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2270   ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
2271 
2272   ptype = product->type;
2273   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
2274   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
2275   switch (ptype) {
2276   case MATPRODUCT_AB:
2277     Amat = Acusp->mat;
2278     Bmat = Bcusp->mat;
2279     break;
2280   case MATPRODUCT_AtB:
2281     Amat = Acusp->matTranspose;
2282     Bmat = Bcusp->mat;
2283     break;
2284   case MATPRODUCT_ABt:
2285     Amat = Acusp->mat;
2286     Bmat = Bcusp->matTranspose;
2287     break;
2288   default:
2289     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2290   }
2291   Cmat = Ccusp->mat;
2292   if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2293   if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2294   if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
2295   Acsr = (CsrMatrix*)Amat->mat;
2296   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
2297   Ccsr = (CsrMatrix*)Cmat->mat;
2298   if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
2299   if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
2300   if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
2301   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2302 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2303   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
2304   stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2305                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2306                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2307                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2308   stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2309                              Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2310                              cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2311 #else
2312   stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2313                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2314                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2315                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2316                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2317 #endif
2318   ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
2319   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2320   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2321   C->offloadmask = PETSC_OFFLOAD_GPU;
2322 finalize:
2323   /* shorter version of MatAssemblyEnd_SeqAIJ */
2324   ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
2325   ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
2326   ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
2327   c->reallocs         = 0;
2328   C->info.mallocs    += 0;
2329   C->info.nz_unneeded = 0;
2330   C->assembled = C->was_assembled = PETSC_TRUE;
2331   C->num_ass++;
2332   PetscFunctionReturn(0);
2333 }
2334 
2335 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2336 {
2337   Mat_Product                  *product = C->product;
2338   Mat                          A,B;
2339   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2340   Mat_SeqAIJ                   *a,*b,*c;
2341   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2342   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2343   PetscInt                     i,j,m,n,k;
2344   PetscBool                    flg;
2345   PetscErrorCode               ierr;
2346   cusparseStatus_t             stat;
2347   cudaError_t                  cerr;
2348   MatProductType               ptype;
2349   MatMatCusparse               *mmdata;
2350   PetscLogDouble               flops;
2351   PetscBool                    biscompressed,ciscompressed;
2352 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2353   int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
2354   size_t                       bufSize2;
2355   cusparseSpMatDescr_t         BmatSpDescr;
2356 #else
2357   int                          cnz;
2358 #endif
2359 
2360   PetscFunctionBegin;
2361   MatCheckProduct(C,1);
2362   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
2363   A    = product->A;
2364   B    = product->B;
2365   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2366   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
2367   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2368   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
2369   a = (Mat_SeqAIJ*)A->data;
2370   b = (Mat_SeqAIJ*)B->data;
2371   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2372   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2373   if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2374   if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2375 
2376   /* product data */
2377   ierr = PetscNew(&mmdata);CHKERRQ(ierr);
2378   C->product->data    = mmdata;
2379   C->product->destroy = MatDestroy_MatMatCusparse;
2380 
2381   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2382   ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
2383   ptype = product->type;
2384   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
2385   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
2386   biscompressed = PETSC_FALSE;
2387   ciscompressed = PETSC_FALSE;
2388   switch (ptype) {
2389   case MATPRODUCT_AB:
2390     m = A->rmap->n;
2391     n = B->cmap->n;
2392     k = A->cmap->n;
2393     Amat = Acusp->mat;
2394     Bmat = Bcusp->mat;
2395     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2396     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2397     break;
2398   case MATPRODUCT_AtB:
2399     m = A->cmap->n;
2400     n = B->cmap->n;
2401     k = A->rmap->n;
2402     ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
2403     Amat = Acusp->matTranspose;
2404     Bmat = Bcusp->mat;
2405     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2406     break;
2407   case MATPRODUCT_ABt:
2408     m = A->rmap->n;
2409     n = B->rmap->n;
2410     k = A->cmap->n;
2411     ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr);
2412     Amat = Acusp->mat;
2413     Bmat = Bcusp->matTranspose;
2414     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2415     break;
2416   default:
2417     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2418   }
2419 
2420   /* create cusparse matrix */
2421   ierr  = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
2422   ierr  = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
2423   c     = (Mat_SeqAIJ*)C->data;
2424   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2425   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
2426   Ccsr  = new CsrMatrix;
2427 
2428   c->compressedrow.use = ciscompressed;
2429   if (c->compressedrow.use) { /* if a is in compressed row format, then c will be in compressed row format too */
2430     c->compressedrow.nrows = a->compressedrow.nrows;
2431     ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
2432     ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
2433     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
2434     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2435     Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
2436   } else {
2437     c->compressedrow.nrows  = 0;
2438     c->compressedrow.i      = NULL;
2439     c->compressedrow.rindex = NULL;
2440     Ccusp->workVector       = NULL;
2441     Cmat->cprowIndices      = NULL;
2442   }
2443   Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
2444   Ccusp->mat      = Cmat;
2445   Ccusp->mat->mat = Ccsr;
2446   Ccsr->num_rows    = Ccusp->nrows;
2447   Ccsr->num_cols    = n;
2448   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
2449   stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
2450   stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
2451   stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
2452   cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
2453   cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
2454   cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
2455   cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2456   cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2457   cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2458   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raises errors in various calls when matrices have zero rows/columns! */
2459     thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
2460     c->nz = 0;
2461     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2462     Ccsr->values = new THRUSTARRAY(c->nz);
2463     goto finalizesym;
2464   }
2465 
2466   if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2467   if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2468   Acsr = (CsrMatrix*)Amat->mat;
2469   if (!biscompressed) {
2470     Bcsr = (CsrMatrix*)Bmat->mat;
2471 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2472     BmatSpDescr = Bmat->matDescr;
2473 #endif
2474   } else { /* we need to use row offsets for the full matrix */
2475     CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
2476     Bcsr = new CsrMatrix;
2477     Bcsr->num_rows       = B->rmap->n;
2478     Bcsr->num_cols       = cBcsr->num_cols;
2479     Bcsr->num_entries    = cBcsr->num_entries;
2480     Bcsr->column_indices = cBcsr->column_indices;
2481     Bcsr->values         = cBcsr->values;
2482     if (!Bcusp->rowoffsets_gpu) {
2483       Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
2484       Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
2485       ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
2486     }
2487     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2488     mmdata->Bcsr = Bcsr;
2489 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2490     if (Bcsr->num_rows && Bcsr->num_cols) {
2491       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
2492                                Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2493                                Bcsr->values->data().get(),
2494                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2495                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2496     }
2497     BmatSpDescr = mmdata->matSpBDescr;
2498 #endif
2499   }
2500   if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
2501   if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
2502   /* precompute flops count */
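  /* for AB, each nonzero a(i,brow) multiplies row brow of B and thus contributes 2*nnz(B[brow,:]) flops;
     for AtB, row i of A (anzi nonzeros) pairs with row i of B (bnzi nonzeros), contributing 2*anzi*bnzi flops */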
2503   if (ptype == MATPRODUCT_AB) {
2504     for (i=0, flops = 0; i<A->rmap->n; i++) {
2505       const PetscInt st = a->i[i];
2506       const PetscInt en = a->i[i+1];
2507       for (j=st; j<en; j++) {
2508         const PetscInt brow = a->j[j];
2509         flops += 2.*(b->i[brow+1] - b->i[brow]);
2510       }
2511     }
2512   } else if (ptype == MATPRODUCT_AtB) {
2513     for (i=0, flops = 0; i<A->rmap->n; i++) {
2514       const PetscInt anzi = a->i[i+1] - a->i[i];
2515       const PetscInt bnzi = b->i[i+1] - b->i[i];
2516       flops += (2.*anzi)*bnzi;
2517     }
2518   } else { /* TODO */
2519     flops = 0.;
2520   }
2521 
2522   mmdata->flops = flops;
2523   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2524 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
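  /* summary of the multi-phase cuSPARSE-11 SpGEMM sequence below: workEstimation (size query,
     then run with mmBuffer2) -> compute (size query, then run with mmBuffer) -> SpMatGetSize
     to obtain nnz(C) -> allocate the C arrays -> CsrSetPointers -> SpGEMM_copy */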
2525   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2526   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
2527                            NULL, NULL, NULL,
2528                            CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2529                            CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2530   stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2531   /* query the number of bytes of external memory needed for the work-estimation step */
2532   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2533                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2534                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2535                                        mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
2536   cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
2537   /* inspect the matrices A and B to understand the memory requirement for the next step */
2538   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2539                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2540                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2541                                        mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
2542   /* query again the number of bytes of external memory, now for the compute step */
2543   stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2544                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2545                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2546                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
2547   /* Neither the CUSPARSE documentation nor the API is clear here:
2548      we need both buffers to perform the operations properly.
2549      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API;
2550      it is only passed to the workEstimation calls, yet it seems to be needed by compute,
2551      so its address is probably stored inside the descriptor. What a messy API... */
2552   cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
2553   /* compute the intermediate product of A * B */
2554   stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2555                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2556                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2557                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2558   /* get the number of nonzero entries of matrix C (C_nnz1) */
2559   stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
2560   c->nz = (PetscInt) C_nnz1;
2561   ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr);
2562   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2563   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2564   Ccsr->values = new THRUSTARRAY(c->nz);
2565   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2566   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
2567                                 Ccsr->values->data().get());CHKERRCUSPARSE(stat);
2568   stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2569                              Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2570                              cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2571 #else
2572   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
2573   stat = cusparseXcsrgemmNnz(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2574                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2575                              Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2576                              Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2577                              Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
2578   c->nz = cnz;
2579   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2580   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2581   Ccsr->values = new THRUSTARRAY(c->nz);
2582   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2583 
2584   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2585   /* with the old gemm interface (removed in CUDA 11.0) we cannot compute the symbolic factorization only.
2586      I have tried the gemm2 interface (alpha * A * B + beta * D), which allows a symbolic-only pass by passing NULL for the values,
2587      but it seems quite buggy when D is NULL, despite the fact that the CUSPARSE documentation claims this is supported! */
2588   stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2589                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2590                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2591                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2592                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2593 #endif
2594   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2595   ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
2596   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2597 finalizesym:
2598   c->singlemalloc = PETSC_FALSE;
2599   c->free_a       = PETSC_TRUE;
2600   c->free_ij      = PETSC_TRUE;
2601   ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
2602   ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
2603   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
2604     PetscInt *d_i = c->i;
2605     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
2606     THRUSTINTARRAY jj(Ccsr->column_indices->size());
2607     ii   = *Ccsr->row_offsets;
2608     jj   = *Ccsr->column_indices;
2609     if (ciscompressed) d_i = c->compressedrow.i;
2610     cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2611     cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2612   } else {
2613     PetscInt *d_i = c->i;
2614     if (ciscompressed) d_i = c->compressedrow.i;
2615     cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2616     cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2617   }
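  /* illustrative example of the expansion below (made-up numbers, not data from a run):
     with m = 5, nonzero rows rindex = {1,3} and compressed offsets {0,2,5}, the full
     offsets become c->i = {0,0,2,2,5,5}; rows absent from rindex inherit the previous offset */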
2618   if (ciscompressed) { /* need to expand host row offsets */
2619     PetscInt r = 0;
2620     c->i[0] = 0;
2621     for (k = 0; k < c->compressedrow.nrows; k++) {
2622       const PetscInt next = c->compressedrow.rindex[k];
2623       const PetscInt old = c->compressedrow.i[k];
2624       for (; r < next; r++) c->i[r+1] = old;
2625     }
2626     for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
2627   }
2628   ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
2629   ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
2630   ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
2631   c->maxnz = c->nz;
2632   c->nonzerorowcnt = 0;
2633   c->rmax = 0;
2634   for (k = 0; k < m; k++) {
2635     const PetscInt nn = c->i[k+1] - c->i[k];
2636     c->ilen[k] = c->imax[k] = nn;
2637     c->nonzerorowcnt += (PetscInt)!!nn;
2638     c->rmax = PetscMax(c->rmax,nn);
2639   }
2640   ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
2641   ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
2642   Ccsr->num_entries = c->nz;
2643 
2644   C->nonzerostate++;
2645   ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
2646   ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
2647   Ccusp->nonzerostate = C->nonzerostate;
2648   C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
2649   C->preallocated  = PETSC_TRUE;
2650   C->assembled     = PETSC_FALSE;
2651   C->was_assembled = PETSC_FALSE;
2652   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
2653     mmdata->reusesym = PETSC_TRUE;
2654     C->offloadmask   = PETSC_OFFLOAD_GPU;
2655   }
2656   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2657   PetscFunctionReturn(0);
2658 }
2659 
2660 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2661 
2662 /* handles sparse or dense B */
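/* Typical call sequence that ends up here (a sketch using the public MatProduct API,
   not code from this file):

     Mat C;
     MatProductCreate(A,B,NULL,&C);       [A of type MATSEQAIJCUSPARSE; B sparse or dense]
     MatProductSetType(C,MATPRODUCT_AB);  [or MATPRODUCT_AtB, MATPRODUCT_ABt, ...]
     MatProductSetFromOptions(C);         [dispatches to the routine below]
     MatProductSymbolic(C);
     MatProductNumeric(C);
*/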
2663 static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
2664 {
2665   Mat_Product    *product = mat->product;
2666   PetscErrorCode ierr;
2667   PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;
2668 
2669   PetscFunctionBegin;
2670   MatCheckProduct(mat,1);
2671   ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr);
2672   if (!product->A->boundtocpu && !product->B->boundtocpu) {
2673     ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr);
2674   }
2675   if (product->type == MATPRODUCT_ABC) {
2676     Ciscusp = PETSC_FALSE;
2677     if (!product->C->boundtocpu) {
2678       ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr);
2679     }
2680   }
2681   if (isdense) {
2682     switch (product->type) {
2683     case MATPRODUCT_AB:
2684     case MATPRODUCT_AtB:
2685     case MATPRODUCT_ABt:
2686     case MATPRODUCT_PtAP:
2687     case MATPRODUCT_RARt:
2688      if (product->A->boundtocpu) {
2689         ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
2690       } else {
2691         mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
2692       }
2693       break;
2694     case MATPRODUCT_ABC:
2695       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2696       break;
2697     default:
2698       break;
2699     }
2700   } else if (Biscusp && Ciscusp) {
2701     switch (product->type) {
2702     case MATPRODUCT_AB:
2703     case MATPRODUCT_AtB:
2704     case MATPRODUCT_ABt:
2705       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2706       break;
2707     case MATPRODUCT_PtAP:
2708     case MATPRODUCT_RARt:
2709     case MATPRODUCT_ABC:
2710       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2711       break;
2712     default:
2713       break;
2714     }
2715   } else { /* fallback for AIJ */
2716     ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
2717   }
2718   PetscFunctionReturn(0);
2719 }
2720 
2721 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2722 {
2723   PetscErrorCode ierr;
2724 
2725   PetscFunctionBegin;
2726   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
2727   PetscFunctionReturn(0);
2728 }
2729 
2730 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
2731 {
2732   PetscErrorCode ierr;
2733 
2734   PetscFunctionBegin;
2735   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
2736   PetscFunctionReturn(0);
2737 }
2738 
2739 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2740 {
2741   PetscErrorCode ierr;
2742 
2743   PetscFunctionBegin;
2744   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
2745   PetscFunctionReturn(0);
2746 }
2747 
2748 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
2749 {
2750   PetscErrorCode ierr;
2751 
2752   PetscFunctionBegin;
2753   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
2754   PetscFunctionReturn(0);
2755 }
2756 
2757 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2758 {
2759   PetscErrorCode ierr;
2760 
2761   PetscFunctionBegin;
2762   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
2763   PetscFunctionReturn(0);
2764 }
2765 
2766 __global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
2767 {
2768   int i = blockIdx.x*blockDim.x + threadIdx.x;
2769   if (i < n) y[idx[i]] += x[i];
2770 }
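/* the kernel above is launched as ScatterAdd<<<(n+255)/256,256,0,stream>>>(...), i.e., one
   thread per entry of the compressed work vector, accumulating x[i] into y[idx[i]] */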
2771 
2772 /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
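/* dispatch summary for the thin wrappers in this file:
     MatMult                        yy = NULL, trans = PETSC_FALSE, herm = PETSC_FALSE
     MatMultAdd                     yy given,  trans = PETSC_FALSE, herm = PETSC_FALSE
     MatMultTranspose(Add)                     trans = PETSC_TRUE,  herm = PETSC_FALSE
     MatMultHermitianTranspose(Add)            trans = PETSC_TRUE,  herm = PETSC_TRUE */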
2773 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
2774 {
2775   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
2776   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
2777   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
2778   PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
2779   PetscErrorCode               ierr;
2780   cudaError_t                  cerr;
2781   cusparseStatus_t             stat;
2782   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2783   PetscBool                    compressed;
2784 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2785   PetscInt                     nx,ny;
2786 #endif
2787 
2788   PetscFunctionBegin;
2789   if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian without transpose is not supported");
2790   if (!a->nonzerorowcnt) {
2791     if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
2792     else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
2793     PetscFunctionReturn(0);
2794   }
2795   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
2796   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2797   if (!trans) {
2798     matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2799     if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
2800   } else {
2801     if (herm || !A->form_explicit_transpose) {
2802       opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
2803       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2804     } else {
2805       if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);}
2806       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
2807     }
2808   }
2809   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
2810   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
2811 
2812   try {
2813     ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
2814     if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
2815     else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */
2816 
2817     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2818     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2819       /* z = A x + beta y.
2820          If A is compressed (with fewer rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
2821          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
2822       */
2823       xptr = xarray;
2824       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
2825       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
2826      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2827       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
2828           allocated to accommodate different uses. So we get the length info directly from mat.
2829        */
2830       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2831         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2832         nx = mat->num_cols;
2833         ny = mat->num_rows;
2834       }
2835      #endif
2836     } else {
2837       /* z = A^T x + beta y
2838          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
2839          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
2840        */
2841       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
2842       dptr = zarray;
2843       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
2844       if (compressed) { /* Scatter x to work vector */
2845         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
2846         thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
2847                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2848                          VecCUDAEqualsReverse());
2849       }
2850      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2851       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2852         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2853         nx = mat->num_rows;
2854         ny = mat->num_cols;
2855       }
2856      #endif
2857     }
2858 
2859     /* csr_spmv does y = alpha op(A) x + beta y */
2860     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2861      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2862       if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
2863       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
2864         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2865         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2866         stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
2867                                 matstruct->matDescr,
2868                                 matstruct->cuSpMV[opA].vecXDescr, beta,
2869                                 matstruct->cuSpMV[opA].vecYDescr,
2870                                 cusparse_scalartype,
2871                                 cusparsestruct->spmvAlg,
2872                                 &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
2873         cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);
2874 
2875         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
2876       } else {
2877         /* the value pointers of x and y may change between calls, but their shapes do not, so we only update the pointers */
2878         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
2879         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
2880       }
2881 
2882       stat = cusparseSpMV(cusparsestruct->handle, opA,
2883                                matstruct->alpha_one,
2884                                matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTransposeForMult() */
2885                                matstruct->cuSpMV[opA].vecXDescr,
2886                                beta,
2887                                matstruct->cuSpMV[opA].vecYDescr,
2888                                cusparse_scalartype,
2889                                cusparsestruct->spmvAlg,
2890                                matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
2891      #else
2892       CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2893       stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
2894                                mat->num_rows, mat->num_cols,
2895                                mat->num_entries, matstruct->alpha_one, matstruct->descr,
2896                                mat->values->data().get(), mat->row_offsets->data().get(),
2897                                mat->column_indices->data().get(), xptr, beta,
2898                                dptr);CHKERRCUSPARSE(stat);
2899      #endif
2900     } else {
2901       if (cusparsestruct->nrows) {
2902        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2903         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2904        #else
2905         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
2906         stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
2907                                  matstruct->alpha_one, matstruct->descr, hybMat,
2908                                  xptr, beta,
2909                                  dptr);CHKERRCUSPARSE(stat);
2910        #endif
2911       }
2912     }
2913     cerr = WaitForCUDA();CHKERRCUDA(cerr);
2914     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2915 
2916     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2917       if (yy) { /* MatMultAdd: zz = A*xx + yy */
2918         if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
2919           ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
2920         } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
2921           ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
2922         }
2923       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
2924         ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
2925       }
2926 
2927       /* ScatterAdd the result from the work vector into the full vector when A is compressed */
2928       if (compressed) {
2929         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2930         /* I wanted to make this for_each asynchronous, but failed: thrust::async::for_each() returns an event (internally registered),
2931            and when that event goes out of scope its destructor calls cudaStreamSynchronize() on the stream. One would have to keep
2932            all the events alive to prevent that, so I use a plain ScatterAdd kernel instead.
2933          */
2934        #if 0
2935         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
2936         thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
2937                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
2938                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2939                          VecCUDAPlusEquals());
2940        #else
2941         PetscInt n = matstruct->cprowIndices->size();
2942         ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
2943        #endif
2944         cerr = WaitForCUDA();CHKERRCUDA(cerr);
2945         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2946       }
2947     } else {
2948       if (yy && yy != zz) {
2949         ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
2950       }
2951     }
2952     ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
2953     if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
2954     else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
2955   } catch(char *ex) {
2956     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
2957   }
2958   if (yy) {
2959     ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
2960   } else {
2961     ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
2962   }
2963   PetscFunctionReturn(0);
2964 }
2965 
2966 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
2967 {
2968   PetscErrorCode ierr;
2969 
2970   PetscFunctionBegin;
2971   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
2972   PetscFunctionReturn(0);
2973 }
2974 
2975 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
2976 {
2977   PetscErrorCode              ierr;
2978   PetscSplitCSRDataStructure  *d_mat = NULL;
2979   PetscFunctionBegin;
2980   if (A->factortype == MAT_FACTOR_NONE) {
2981     d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
2982   }
2983   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); // this does very little if assembled on GPU - call it?
2984   if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0);
2985   if (d_mat) {
2986     A->offloadmask = PETSC_OFFLOAD_GPU;
2987   }
2988 
2989   PetscFunctionReturn(0);
2990 }
2991 
2992 /* --------------------------------------------------------------------------------*/
2993 /*@
2994    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
2995    (the default parallel PETSc format). This matrix will ultimately be pushed down
2996    to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
2997    assembly performance the user should preallocate the matrix storage by setting
2998    the parameter nz (or the array nnz).  By setting these parameters accurately,
2999    performance during matrix assembly can be increased by more than a factor of 50.
3000 
3001    Collective
3002 
3003    Input Parameters:
3004 +  comm - MPI communicator, set to PETSC_COMM_SELF
3005 .  m - number of rows
3006 .  n - number of columns
3007 .  nz - number of nonzeros per row (same for all rows)
3008 -  nnz - array containing the number of nonzeros in the various rows
3009          (possibly different for each row) or NULL
3010 
3011    Output Parameter:
3012 .  A - the matrix
3013 
3014    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
3015    MatXXXXSetPreallocation() paradigm instead of calling this routine directly.
3016    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
3017 
3018    Notes:
3019    If nnz is given then nz is ignored
3020 
3021    The AIJ format (also called the Yale sparse matrix format or
3022    compressed row storage), is fully compatible with standard Fortran 77
3023    storage.  That is, the stored row and column indices can begin at
3024    either one (as in Fortran) or zero.  See the users' manual for details.
3025 
3026    Specify the preallocated storage with either nz or nnz (not both).
3027    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
3028    allocation.  For large problems you MUST preallocate memory or you
3029    will get TERRIBLE performance, see the users' manual chapter on matrices.
3030 
3031    By default, this format uses inodes (identical nodes) when possible, to
3032    improve numerical efficiency of matrix-vector products and solves. We
3033    search for consecutive rows with the same nonzero structure, thereby
3034    reusing matrix information to achieve increased efficiency.
3035 
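   Example usage (a minimal sketch; error checking omitted, and the preallocation value 3
   is only an illustrative guess):
.vb
     Mat A;
     MatCreateSeqAIJCUSPARSE(PETSC_COMM_SELF,m,n,3,NULL,&A);
     MatSetValue(A,0,0,1.0,INSERT_VALUES);
     MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
     MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
     MatMult(A,x,y);
     MatDestroy(&A);
.ve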
3036    Level: intermediate
3037 
3038 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
3039 @*/
3040 PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
3041 {
3042   PetscErrorCode ierr;
3043 
3044   PetscFunctionBegin;
3045   ierr = MatCreate(comm,A);CHKERRQ(ierr);
3046   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
3047   ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
3048   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
3049   PetscFunctionReturn(0);
3050 }
3051 
3052 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
3053 {
3054   PetscErrorCode              ierr;
3055   PetscSplitCSRDataStructure  *d_mat = NULL;
3056 
3057   PetscFunctionBegin;
3058   if (A->factortype == MAT_FACTOR_NONE) {
3059     d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
3060     ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL;
3061     ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
3062   } else {
3063     ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
3064   }
3065   if (d_mat) {
3066     Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;
3067     cudaError_t                err;
3068     PetscSplitCSRDataStructure h_mat;
3069     ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
3070     err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
3071     if (a->compressedrow.use) {
3072       err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
3073     }
3074     err = cudaFree(d_mat);CHKERRCUDA(err);
3075   }
3076   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
3077   ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
3078   ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
3079   ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
3080   ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
3081   ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
3082   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
3083   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
3084   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
3085   PetscFunctionReturn(0);
3086 }
3087 
3088 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
3089 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
3090 static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
3091 {
3092   PetscErrorCode ierr;
3093 
3094   PetscFunctionBegin;
3095   ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
3096   ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
3097   PetscFunctionReturn(0);
3098 }
3099 
3100 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
3101 {
3102   PetscErrorCode     ierr;
3103   Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
3104   Mat_SeqAIJCUSPARSE *cy;
3105   Mat_SeqAIJCUSPARSE *cx;
3106   PetscScalar        *ay;
3107   const PetscScalar  *ax;
3108   CsrMatrix          *csry,*csrx;
3109   cudaError_t        cerr;
3110 
3111   PetscFunctionBegin;
3112   cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
3113   cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
3114   if (X->ops->axpy != Y->ops->axpy) {
3115     ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
3116     ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
3117     PetscFunctionReturn(0);
3118   }
3119   /* if we are here, neither matrix is bound to the CPU, so both live on the GPU */
3120   ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
3121   ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
3122   if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
3123   if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
3124   csry = (CsrMatrix*)cy->mat->mat;
3125   csrx = (CsrMatrix*)cx->mat->mat;
3126   /* see if we can turn this into a cublas axpy */
3127   if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
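    /* equal row offsets and equal column indices mean the two CSR value arrays are aligned elementwise */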
3128     bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
3129     if (eq) {
3130       eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
3131     }
3132     if (eq) str = SAME_NONZERO_PATTERN;
3133   }
3134   /* spgeam is buggy with one column */
3135   if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;
3136 
3137   if (str == SUBSET_NONZERO_PATTERN) {
3138     cusparseStatus_t stat;
3139     PetscScalar      b = 1.0;
3140 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3141     size_t           bufferSize;
3142     void             *buffer;
3143 #endif
3144 
3145     ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
3146     ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
3147     stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
3148 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3149     stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
3150                                           &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
3151                                           &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
3152                                              cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
3153     cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
3154     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3155     stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
3156                                &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
3157                                &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
3158                                   cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
3159     cerr = WaitForCUDA();CHKERRCUDA(cerr);
3160     ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
3161     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
3162     cerr = cudaFree(buffer);CHKERRCUDA(cerr);
3163 #else
3164     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3165     stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
3166                                &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
3167                                &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
3168                                   cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat);
3169     cerr = WaitForCUDA();CHKERRCUDA(cerr);
3170     ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
3171     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
3172 #endif
3173     stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
3174     ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
3175     ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
3176     ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
3177   } else if (str == SAME_NONZERO_PATTERN) {
3178     cublasHandle_t cublasv2handle;
3179     cublasStatus_t berr;
3180     PetscBLASInt   one = 1, bnz = 1;
3181 
3182     ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
3183     ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
3184     ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
3185     ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr);
3186     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3187     berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr);
3188     cerr = WaitForCUDA();CHKERRCUDA(cerr);
3189     ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
3190     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
3191     ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
3192     ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
3193     ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
3194   } else {
3195     ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
3196     ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
3197   }
3198   PetscFunctionReturn(0);
3199 }
3200 
3201 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
3202 {
3203   PetscErrorCode ierr;
3204   Mat_SeqAIJ     *y = (Mat_SeqAIJ*)Y->data;
3205   PetscScalar    *ay;
3206   cudaError_t    cerr;
3207   cublasHandle_t cublasv2handle;
3208   cublasStatus_t berr;
3209   PetscBLASInt   one = 1, bnz = 1;
3210 
3211   PetscFunctionBegin;
3212   ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
3213   ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
3214   ierr = PetscBLASIntCast(y->nz,&bnz);CHKERRQ(ierr);
3215   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3216   berr = cublasXscal(cublasv2handle,bnz,&a,ay,one);CHKERRCUBLAS(berr);
3217   cerr = WaitForCUDA();CHKERRCUDA(cerr);
3218   ierr = PetscLogGpuFlops(bnz);CHKERRQ(ierr);
3219   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
3220   ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
3221   ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
3222   PetscFunctionReturn(0);
3223 }
3224 
3225 static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
3226 {
3227   PetscErrorCode             ierr;
3228   PetscBool                  both = PETSC_FALSE;
3229   Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;
3230 
3231   PetscFunctionBegin;
3232   if (A->factortype == MAT_FACTOR_NONE) {
3233     Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
3234     if (spptr->mat) {
3235       CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
3236       if (matrix->values) {
3237         both = PETSC_TRUE;
3238         thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
3239       }
3240     }
3241     if (spptr->matTranspose) {
3242       CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
3243       if (matrix->values) {
3244         thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
3245       }
3246     }
3247   }
3248   /* inlined shorter version of MatZeroEntries_SeqAIJ() so we can set the offload mask ourselves below */
3249   ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr);
3250   ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
3251   if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
3252   else A->offloadmask = PETSC_OFFLOAD_CPU;
3253 
3254   PetscFunctionReturn(0);
3255 }
3256 
3257 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
3258 {
3259   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
3260   PetscErrorCode ierr;
3261 
3262   PetscFunctionBegin;
3263   if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
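  /* flg == PETSC_TRUE binds the matrix to the CPU: values are copied back from the GPU and
     all operations are dispatched to the plain SeqAIJ kernels; flg == PETSC_FALSE restores
     the CUSPARSE implementations */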
3264   if (flg) {
3265     ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
3266 
3267     A->ops->scale                     = MatScale_SeqAIJ;
3268     A->ops->axpy                      = MatAXPY_SeqAIJ;
3269     A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
3270     A->ops->mult                      = MatMult_SeqAIJ;
3271     A->ops->multadd                   = MatMultAdd_SeqAIJ;
3272     A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
3273     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
3274     A->ops->multhermitiantranspose    = NULL;
3275     A->ops->multhermitiantransposeadd = NULL;
3276     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
3277     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
3278     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
3279     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
3280     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
3281     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
3282     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
3283     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
3284   } else {
3285     A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
3286     A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
3287     A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
3288     A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
3289     A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
3290     A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
3291     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
3292     A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
3293     A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
3294     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
3295     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
3296     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
3297     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
3298     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
3299     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
3300     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
3301     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
3302   }
3303   A->boundtocpu = flg;
3304   a->inode.use = flg;
3305   PetscFunctionReturn(0);
3306 }
3307 
3308 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
3309 {
3310   PetscErrorCode   ierr;
3311   cusparseStatus_t stat;
3312   Mat              B;
3313 
3314   PetscFunctionBegin;
3315   ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */
3316   if (reuse == MAT_INITIAL_MATRIX) {
3317     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
3318   } else if (reuse == MAT_REUSE_MATRIX) {
3319     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
3320   }
3321   B = *newmat;
3322 
3323   ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
3324   ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);
3325 
3326   if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
3327     if (B->factortype == MAT_FACTOR_NONE) {
3328       Mat_SeqAIJCUSPARSE *spptr;
3329       ierr = PetscNew(&spptr);CHKERRQ(ierr);
3330       stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
3331       stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
3332       spptr->format     = MAT_CUSPARSE_CSR;
3333      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3334       spptr->spmvAlg    = CUSPARSE_CSRMV_ALG1;    /* default, since we only support csr */
3335       spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default; we only support a column-major dense matrix B */
3336       spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
3337      #endif
3338       B->spptr = spptr;
3339     } else {
3340       Mat_SeqAIJCUSPARSETriFactors *spptr;
3341 
3342       ierr = PetscNew(&spptr);CHKERRQ(ierr);
3343       stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
3344       stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
3345       B->spptr = spptr;
3346     }
3347     B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
3348   }
3349   B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
3350   B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
3351   B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
3352   B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
3353   B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
3354   B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;
3355 
3356   ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
3357   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
3358   ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
3359   PetscFunctionReturn(0);
3360 }
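
/* Example (a sketch): promote an existing host matrix A to CUSPARSE in place:

     ierr = MatConvert(A,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
*/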
3361 
3362 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
3363 {
3364   PetscErrorCode ierr;
3365 
3366   PetscFunctionBegin;
3367   ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
3368   ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
3369   PetscFunctionReturn(0);
3370 }
3371 
3372 /*MC
3373    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3374 
3375    A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
3376    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later and are no longer supported since CUDA 11.0.
3377    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3378 
3379    Options Database Keys:
3380 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
3381 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3382 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3383 
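   Example (a minimal sketch; the matrix size n and standard PETSc error checking are assumed):
.vb
   Mat A;
   ierr = MatCreate(PETSC_COMM_SELF,&A);CHKERRQ(ierr);
   ierr = MatSetSizes(A,n,n,n,n);CHKERRQ(ierr);
   ierr = MatSetType(A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
   ierr = MatSetFromOptions(A);CHKERRQ(ierr); /* picks up -mat_cusparse_storage_format etc. */
.ve
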
3384   Level: beginner
3385 
3386 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3387 M*/
3388 
3389 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);
3390 
3391 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
3392 {
3393   PetscErrorCode ierr;
3394 
3395   PetscFunctionBegin;
3396   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr);
3397   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
3398   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
3399   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
3400   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
3401 
3402   PetscFunctionReturn(0);
3403 }
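
/* With these registrations a CUSPARSE factorization can be requested at run time, e.g.
   (a sketch) -pc_type ilu -pc_factor_mat_solver_type cusparse, or programmatically:

     ierr = PCFactorSetMatSolverType(pc,MATSOLVERCUSPARSE);CHKERRQ(ierr);
*/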
3404 
3405 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
3406 {
3407   PetscErrorCode   ierr;
3408   cusparseStatus_t stat;
3409 
3410   PetscFunctionBegin;
3411   if (*cusparsestruct) {
3412     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
3413     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
3414     delete (*cusparsestruct)->workVector;
3415     delete (*cusparsestruct)->rowoffsets_gpu;
3416     delete (*cusparsestruct)->cooPerm;
3417     delete (*cusparsestruct)->cooPerm_a;
3418     delete (*cusparsestruct)->csr2csc_i;
3419     if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
3420     ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
3421   }
3422   PetscFunctionReturn(0);
3423 }
3424 
3425 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
3426 {
3427   PetscFunctionBegin;
3428   if (*mat) {
3429     delete (*mat)->values;
3430     delete (*mat)->column_indices;
3431     delete (*mat)->row_offsets;
3432     delete *mat;
3433     *mat = NULL;
3434   }
3435   PetscFunctionReturn(0);
3436 }
3437 
3438 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
3439 {
3440   cusparseStatus_t stat;
3441   PetscErrorCode   ierr;
3442 
3443   PetscFunctionBegin;
3444   if (*trifactor) {
3445     if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
3446     if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
3447     ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
3448     if ((*trifactor)->solveBuffer)   {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
3449     if ((*trifactor)->AA_h)   {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);}
3450    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3451     if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
3452    #endif
3453     ierr = PetscFree(*trifactor);CHKERRQ(ierr);
3454   }
3455   PetscFunctionReturn(0);
3456 }
3457 
3458 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
3459 {
3460   CsrMatrix        *mat;
3461   cusparseStatus_t stat;
3462   cudaError_t      err;
  PetscErrorCode   ierr;
3463 
3464   PetscFunctionBegin;
3465   if (*matstruct) {
3466     if ((*matstruct)->mat) {
3467       if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
3468        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3469         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3470        #else
3471         cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
3472         stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
3473        #endif
3474       } else {
3475         mat = (CsrMatrix*)(*matstruct)->mat;
3476         ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
3477       }
3478     }
3479     if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
3480     delete (*matstruct)->cprowIndices;
3481     if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
3482     if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
3483     if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
3484 
3485    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3486     Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
3487     if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
3488     for (int i=0; i<3; i++) {
3489       if (mdata->cuSpMV[i].initialized) {
3490         err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
3491         stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
3492         stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
3493       }
3494     }
3495    #endif
3496     delete *matstruct;
3497     *matstruct = NULL;
3498   }
3499   PetscFunctionReturn(0);
3500 }
3501 
3502 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
3503 {
3504   PetscErrorCode ierr;
3505 
3506   PetscFunctionBegin;
3507   if (*trifactors) {
3508     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
3509     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
3510     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
3511     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
3512     delete (*trifactors)->rpermIndices;
3513     delete (*trifactors)->cpermIndices;
3514     delete (*trifactors)->workVector;
3515     (*trifactors)->rpermIndices = NULL;
3516     (*trifactors)->cpermIndices = NULL;
3517     (*trifactors)->workVector = NULL;
3518     if ((*trifactors)->a_band_d)   {cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);}
3519     if ((*trifactors)->i_band_d)   {cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);}
3520   }
3521   PetscFunctionReturn(0);
3522 }
3523 
3524 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
3525 {
3526   PetscErrorCode   ierr;
3527   cusparseHandle_t handle;
3528   cusparseStatus_t stat;
3529 
3530   PetscFunctionBegin;
3531   if (*trifactors) {
3532     ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
3533     if ((handle = (*trifactors)->handle)) {
3534       stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
3535     }
3536     ierr = PetscFree(*trifactors);CHKERRQ(ierr);
3537   }
3538   PetscFunctionReturn(0);
3539 }
3540 
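/* orders COO entries lexicographically by (row, col), i.e. into CSR order */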
3541 struct IJCompare
3542 {
3543   __host__ __device__
3544   inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
3545   {
3546     if (t1.get<0>() < t2.get<0>()) return true;
3547     if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
3548     return false;
3549   }
3550 };
3551 
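/* two COO entries are duplicates when both row and column indices match */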
3552 struct IJEqual
3553 {
3554   __host__ __device__
3555   inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
3556   {
3557     if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false;
3558     return true;
3559   }
3560 };
3561 
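/* IJDiff and IJSum are used together below: adjacent_difference with IJDiff flags the
   positions where the row (or column) index changes, and IJSum ORs the two flag arrays
   to mark the first entry of every distinct (i,j) pair */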
3562 struct IJDiff
3563 {
3564   __host__ __device__
3565   inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
3566   {
3567     return t1 == t2 ? 0 : 1;
3568   }
3569 };
3570 
3571 struct IJSum
3572 {
3573   __host__ __device__
3574   inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
3575   {
3576     return t1||t2;
3577   }
3578 };
3579 
3580 #include <thrust/iterator/discard_iterator.h>
3581 PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
3582 {
3583   Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3584   Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
3585   THRUSTARRAY                           *cooPerm_v = NULL;
3586   thrust::device_ptr<const PetscScalar> d_v;
3587   CsrMatrix                             *matrix;
3588   PetscErrorCode                        ierr;
3589   cudaError_t                           cerr;
3590   PetscInt                              n;
3591 
3592   PetscFunctionBegin;
3593   if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
3594   if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
3595   if (!cusp->cooPerm) {
3596     ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
3597     ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
3598     PetscFunctionReturn(0);
3599   }
3600   matrix = (CsrMatrix*)cusp->mat->mat;
3601   if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
3602   if (!v) {
3603     if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
3604     goto finalize;
3605   }
3606   n = cusp->cooPerm->size();
3607   if (isCudaMem(v)) {
3608     d_v = thrust::device_pointer_cast(v);
3609   } else {
3610     cooPerm_v = new THRUSTARRAY(n);
3611     cooPerm_v->assign(v,v+n);
3612     d_v = cooPerm_v->data();
3613     ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
3614   }
3615   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3616   if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
3617     if (cusp->cooPerm_a) {
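      /* duplicate COO entries were compressed at preallocation time: cooPerm_a maps every
         sorted input entry to its CSR slot, so first sum duplicates into a work array via
         reduce_by_key, then add the reduced values to the matrix */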
3618       THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
3619       auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
3620       thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
3621       thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
3622       delete cooPerm_w;
3623     } else {
3624       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
3625                                                                 matrix->values->begin()));
3626       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
3627                                                                 matrix->values->end()));
3628       thrust::for_each(zibit,zieit,VecCUDAPlusEquals());
3629     }
3630   } else {
3631     if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
3632       auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
3633       thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
3634     } else {
3635       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
3636                                                                 matrix->values->begin()));
3637       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
3638                                                                 matrix->values->end()));
3639       thrust::for_each(zibit,zieit,VecCUDAEquals());
3640     }
3641   }
3642   cerr = WaitForCUDA();CHKERRCUDA(cerr);
3643   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
3644 finalize:
3645   delete cooPerm_v;
3646   A->offloadmask = PETSC_OFFLOAD_GPU;
3647   ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
3648   /* shorter version of MatAssemblyEnd_SeqAIJ */
3649   ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr);
3650   ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
3651   ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr);
3652   a->reallocs         = 0;
3653   A->info.mallocs    += 0;
3654   A->info.nz_unneeded = 0;
3655   A->assembled = A->was_assembled = PETSC_TRUE;
3656   A->num_ass++;
3657   PetscFunctionReturn(0);
3658 }
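
/* Example of the COO assembly path (a minimal sketch; the arrays are illustrative and
   A is assumed to already be a MATSEQAIJCUSPARSE matrix):

     PetscInt    coo_i[] = {0,0,1};
     PetscInt    coo_j[] = {0,1,1};
     PetscScalar v[]     = {1.0,2.0,3.0};
     ierr = MatSetPreallocationCOO(A,3,coo_i,coo_j);CHKERRQ(ierr);
     ierr = MatSetValuesCOO(A,v,INSERT_VALUES);CHKERRQ(ierr);
*/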
3659 
3660 PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
3661 {
3662   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3663   PetscErrorCode     ierr;
3664 
3665   PetscFunctionBegin;
3666   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
3667   if (!cusp) PetscFunctionReturn(0);
3668   if (destroy) {
3669     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
3670     delete cusp->csr2csc_i;
3671     cusp->csr2csc_i = NULL;
3672   }
3673   A->transupdated = PETSC_FALSE;
3674   PetscFunctionReturn(0);
3675 }
3676 
3677 #include <thrust/binary_search.h>
3678 PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
3679 {
3680   PetscErrorCode     ierr;
3681   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3682   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
3683   PetscInt           cooPerm_n, nzr = 0;
3684   cudaError_t        cerr;
3685 
3686   PetscFunctionBegin;
3687   ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
3688   ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
3689   cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
3690   if (n != cooPerm_n) {
3691     delete cusp->cooPerm;
3692     delete cusp->cooPerm_a;
3693     cusp->cooPerm = NULL;
3694     cusp->cooPerm_a = NULL;
3695   }
3696   if (n) {
3697     THRUSTINTARRAY d_i(n);
3698     THRUSTINTARRAY d_j(n);
3699     THRUSTINTARRAY ii(A->rmap->n);
3700 
3701     if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
3702     if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }
3703 
3704     ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
3705     d_i.assign(coo_i,coo_i+n);
3706     d_j.assign(coo_j,coo_j+n);
3707     auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
3708     auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));
3709 
3710     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3711     thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
3712     thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare());
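    /* cooPerm[k] now records which position in the caller's (coo_i,coo_j) arrays provides the k-th entry in CSR order */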
3713     *cusp->cooPerm_a = d_i;
3714     THRUSTINTARRAY w = d_j;
3715 
3716     auto nekey = thrust::unique(fkey, ekey, IJEqual());
3717     if (nekey == ekey) { /* all entries are unique */
3718       delete cusp->cooPerm_a;
3719       cusp->cooPerm_a = NULL;
3720     } else { /* I couldn't come up with a more elegant algorithm */
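      /* adjacent_difference (resolved by ADL on the thrust iterators) leaves a 1 wherever the
         row (resp. column) index differs from its predecessor; IJSum ORs the two flag arrays,
         so a 1 marks the first occurrence of each distinct (i,j); the inclusive scan then turns
         these flags into the index of the unique CSR slot every sorted entry maps to */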
3721       adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff());
3722       adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());
3723       (*cusp->cooPerm_a)[0] = 0;
3724       w[0] = 0;
3725       thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum());
3726       thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>());
3727     }
3728     thrust::counting_iterator<PetscInt> search_begin(0);
3729     thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(),
3730                         search_begin, search_begin + A->rmap->n,
3731                         ii.begin());
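    /* ii[r] is the number of unique entries in rows 0..r, i.e. the CSR row offset of row r+1 */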
3732     cerr = WaitForCUDA();CHKERRCUDA(cerr);
3733     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
3734 
3735     ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
3736     a->singlemalloc = PETSC_FALSE;
3737     a->free_a       = PETSC_TRUE;
3738     a->free_ij      = PETSC_TRUE;
3739     ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
3740     a->i[0] = 0;
3741     cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
3742     a->nz = a->maxnz = a->i[A->rmap->n];
3743     a->rmax = 0;
3744     ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
3745     ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
3746     cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
3747     if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
3748     if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
3749     for (PetscInt i = 0; i < A->rmap->n; i++) {
3750       const PetscInt nnzr = a->i[i+1] - a->i[i];
3751       nzr += (PetscInt)!!(nnzr);
3752       a->ilen[i] = a->imax[i] = nnzr;
3753       a->rmax = PetscMax(a->rmax,nnzr);
3754     }
3755     a->nonzerorowcnt = nzr;
3756     A->preallocated = PETSC_TRUE;
3757     ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
3758     ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
3759   } else {
3760     ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
3761   }
3762   ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);
3763 
3764   /* We want to allocate the CUSPARSE struct for matvec now.
3765      The code is so convoluted that I prefer to copy zeros */
3766   ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
3767   ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
3768   A->offloadmask = PETSC_OFFLOAD_CPU;
3769   A->nonzerostate++;
3770   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
3771   ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
3772 
3773   A->assembled = PETSC_FALSE;
3774   A->was_assembled = PETSC_FALSE;
3775   PetscFunctionReturn(0);
3776 }
3777 
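/* The accessor pairs below differ in how they treat the device copy: GetArrayRead()
   copies the values to the GPU if needed and leaves the offload mask untouched;
   GetArray() additionally marks the GPU copy as authoritative and invalidates the cached
   transpose; GetArrayWrite() does the same but skips the host-to-device copy, since the
   caller is about to overwrite the values anyway */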
3778 PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
3779 {
3780   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3781   CsrMatrix          *csr;
3782   PetscErrorCode     ierr;
3783 
3784   PetscFunctionBegin;
3785   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
3786   PetscValidPointer(a,2);
3787   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
3788   if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3789   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
3790   if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3791   csr = (CsrMatrix*)cusp->mat->mat;
3792   if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
3793   *a = csr->values->data().get();
3794   PetscFunctionReturn(0);
3795 }
3796 
3797 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
3798 {
3799   PetscFunctionBegin;
3800   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
3801   PetscValidPointer(a,2);
3802   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
3803   *a = NULL;
3804   PetscFunctionReturn(0);
3805 }
3806 
3807 PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
3808 {
3809   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3810   CsrMatrix          *csr;
3811   PetscErrorCode     ierr;
3812 
3813   PetscFunctionBegin;
3814   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
3815   PetscValidPointer(a,2);
3816   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
3817   if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3818   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
3819   if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3820   csr = (CsrMatrix*)cusp->mat->mat;
3821   if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
3822   *a = csr->values->data().get();
3823   A->offloadmask = PETSC_OFFLOAD_GPU;
3824   ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
3825   PetscFunctionReturn(0);
3826 }
3827 
3828 PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
3829 {
3830   PetscErrorCode ierr;
3831 
3832   PetscFunctionBegin;
3833   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
3834   PetscValidPointer(a,2);
3835   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
3836   ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
3837   *a = NULL;
3838   PetscFunctionReturn(0);
3839 }
3840 
3841 PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
3842 {
3843   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3844   CsrMatrix          *csr;
3845   PetscErrorCode     ierr;
3846 
3847   PetscFunctionBegin;
3848   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
3849   PetscValidPointer(a,2);
3850   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
3851   if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3852   if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3853   csr = (CsrMatrix*)cusp->mat->mat;
3854   if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
3855   *a = csr->values->data().get();
3856   A->offloadmask = PETSC_OFFLOAD_GPU;
3857   ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
3858   PetscFunctionReturn(0);
3859 }
3860 
3861 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
3862 {
3863   PetscErrorCode ierr;
3864 
3865   PetscFunctionBegin;
3866   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
3867   PetscValidPointer(a,2);
3868   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
3869   ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
3870   *a = NULL;
3871   PetscFunctionReturn(0);
3872 }
3873 
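/* same lexicographic (row, col) ordering as IJCompare, on the (row, col, value, flag) tuples used by the merge */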
3874 struct IJCompare4
3875 {
3876   __host__ __device__
3877   inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
3878   {
3879     if (t1.get<0>() < t2.get<0>()) return true;
3880     if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
3881     return false;
3882   }
3883 };
3884 
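/* adds a fixed offset to an index; used, e.g., to shift B's column indices by A->cmap->n when forming [A,B] */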
3885 struct Shift
3886 {
3887   int _shift;
3888 
3889   Shift(int shift) : _shift(shift) {}
3890   __host__ __device__
3891   inline int operator() (const int &c)
3892   {
3893     return c + _shift;
3894   }
3895 };
3896 
3897 /* merges two SeqAIJCUSPARSE matrices side by side into C = [A,B], the [A';B']' operation in MATLAB notation:
   both inputs are converted to COO form, B's column indices are shifted by A->cmap->n, the two streams are
   stable-merged by (row,col) and converted back to CSR; cooPerm records the interleaving so a reuse call
   only needs to scatter new values */
3898 PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
3899 {
3900   PetscErrorCode               ierr;
3901   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
3902   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
3903   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
3904   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
3905   PetscInt                     Annz,Bnnz;
3906   cusparseStatus_t             stat;
3907   PetscInt                     i,m,n,zero = 0;
3908   cudaError_t                  cerr;
3909 
3910   PetscFunctionBegin;
3911   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
3912   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
3913   PetscValidPointer(C,4);
3914   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
3915   PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
3916   if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %D != %D",A->rmap->n,B->rmap->n);
3917   if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
3918   if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3919   if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3920   if (reuse == MAT_INITIAL_MATRIX) {
3921     m     = A->rmap->n;
3922     n     = A->cmap->n + B->cmap->n;
3923     ierr  = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
3924     ierr  = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
3925     ierr  = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
3926     c     = (Mat_SeqAIJ*)(*C)->data;
3927     Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
3928     Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
3929     Ccsr  = new CsrMatrix;
3930     Cmat->cprowIndices      = NULL;
3931     c->compressedrow.use    = PETSC_FALSE;
3932     c->compressedrow.nrows  = 0;
3933     c->compressedrow.i      = NULL;
3934     c->compressedrow.rindex = NULL;
3935     Ccusp->workVector       = NULL;
3936     Ccusp->nrows    = m;
3937     Ccusp->mat      = Cmat;
3938     Ccusp->mat->mat = Ccsr;
3939     Ccsr->num_rows  = m;
3940     Ccsr->num_cols  = n;
3941     stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
3942     stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
3943     stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
3944     cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
3945     cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
3946     cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
3947     cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
3948     cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
3949     cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
3950     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
3951     ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
3952     ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
3953     ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr);
3954     if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3955     if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3956 
3957     Acsr = (CsrMatrix*)Acusp->mat->mat;
3958     Bcsr = (CsrMatrix*)Bcusp->mat->mat;
3959     Annz = (PetscInt)Acsr->column_indices->size();
3960     Bnnz = (PetscInt)Bcsr->column_indices->size();
3961     c->nz = Annz + Bnnz;
3962     Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
3963     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3964     Ccsr->values = new THRUSTARRAY(c->nz);
3965     Ccsr->num_entries = c->nz;
3966     Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
3967     if (c->nz) {
3968       auto Acoo = new THRUSTINTARRAY32(Annz);
3969       auto Bcoo = new THRUSTINTARRAY32(Bnnz);
3970       auto Ccoo = new THRUSTINTARRAY32(c->nz);
3971       THRUSTINTARRAY32 *Aroff,*Broff;
3972 
3973       if (a->compressedrow.use) { /* need full row offset */
3974         if (!Acusp->rowoffsets_gpu) {
3975           Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
3976           Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
3977           ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
3978         }
3979         Aroff = Acusp->rowoffsets_gpu;
3980       } else Aroff = Acsr->row_offsets;
3981       if (b->compressedrow.use) { /* need full row offset */
3982         if (!Bcusp->rowoffsets_gpu) {
3983           Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
3984           Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
3985           ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
3986         }
3987         Broff = Bcusp->rowoffsets_gpu;
3988       } else Broff = Bcsr->row_offsets;
3989       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3990       stat = cusparseXcsr2coo(Acusp->handle,
3991                               Aroff->data().get(),
3992                               Annz,
3993                               m,
3994                               Acoo->data().get(),
3995                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
3996       stat = cusparseXcsr2coo(Bcusp->handle,
3997                               Broff->data().get(),
3998                               Bnnz,
3999                               m,
4000                               Bcoo->data().get(),
4001                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4002       /* int rather than bool flags: issues when using bool with large matrices on SUMMIT, CUDA 10.2.89 */
4003       auto Aperm = thrust::make_constant_iterator(1);
4004       auto Bperm = thrust::make_constant_iterator(0);
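      /* after the stable merge, these constant flags record whether an entry came from A (1) or
         B (0); copy_if/remove_copy_if below split the merged positions back into the A part and
         the B part of cooPerm for later value-only updates */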
4005 #if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
4006       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
4007       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
4008 #else
4009       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
4010       auto Bcib = Bcsr->column_indices->begin();
4011       auto Bcie = Bcsr->column_indices->end();
4012       thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
4013 #endif
4014       auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
4015       auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
4016       auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
4017       auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
4018       auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
4019       auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
4020       auto p1 = Ccusp->cooPerm->begin();
4021       auto p2 = Ccusp->cooPerm->begin();
4022       thrust::advance(p2,Annz);
4023       PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
4024 #if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
4025       thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
4026 #endif
4027       auto cci = thrust::make_counting_iterator(zero);
4028       auto cce = thrust::make_counting_iterator(c->nz);
4029 #if 0 // thrust::partition_copy errors on SUMMIT with CUDA 11.1.0
4030       PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
4031 #else
4032       auto pred = thrust::identity<int>();
4033       PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
4034       PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
4035 #endif
4036       stat = cusparseXcoo2csr(Ccusp->handle,
4037                               Ccoo->data().get(),
4038                               c->nz,
4039                               m,
4040                               Ccsr->row_offsets->data().get(),
4041                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4042       cerr = WaitForCUDA();CHKERRCUDA(cerr);
4043       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4044       delete wPerm;
4045       delete Acoo;
4046       delete Bcoo;
4047       delete Ccoo;
4048 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4049       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
4050                                Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
4051                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4052                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4053 #endif
4054       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have explicit transposes, generate the transpose of C too */
4055         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4056         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4057         CsrMatrix *CcsrT = new CsrMatrix;
4058         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4059         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4060 
4061         (*C)->form_explicit_transpose = PETSC_TRUE;
4062         (*C)->transupdated = PETSC_TRUE;
4063         Ccusp->rowoffsets_gpu = NULL;
4064         CmatT->cprowIndices = NULL;
4065         CmatT->mat = CcsrT;
4066         CcsrT->num_rows = n;
4067         CcsrT->num_cols = m;
4068         CcsrT->num_entries = c->nz;
4069 
4070         CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
4071         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4072         CcsrT->values = new THRUSTARRAY(c->nz);
4073 
4074         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4075         auto rT = CcsrT->row_offsets->begin();
4076         if (AT) {
4077           rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
4078           thrust::advance(rT,-1);
4079         }
4080         if (BT) {
4081           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
4082           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
4083           thrust::copy(titb,tite,rT);
4084         }
4085         auto cT = CcsrT->column_indices->begin();
4086         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
4087         if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
4088         auto vT = CcsrT->values->begin();
4089         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4090         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4091         cerr = WaitForCUDA();CHKERRCUDA(cerr);
4092         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4093 
4094         stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
4095         stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4096         stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
4097         cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
4098         cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
4099         cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
4100         cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4101         cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4102         cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4103 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4104         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
4105                                  CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
4106                                  CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4107                                  CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4108 #endif
4109         Ccusp->matTranspose = CmatT;
4110       }
4111     }
4112 
4113     c->singlemalloc = PETSC_FALSE;
4114     c->free_a       = PETSC_TRUE;
4115     c->free_ij      = PETSC_TRUE;
4116     ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
4117     ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
4118     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
4119       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4120       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4121       ii   = *Ccsr->row_offsets;
4122       jj   = *Ccsr->column_indices;
4123       cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4124       cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4125     } else {
4126       cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4127       cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4128     }
4129     ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
4130     ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
4131     ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
4132     c->maxnz = c->nz;
4133     c->nonzerorowcnt = 0;
4134     c->rmax = 0;
4135     for (i = 0; i < m; i++) {
4136       const PetscInt nn = c->i[i+1] - c->i[i];
4137       c->ilen[i] = c->imax[i] = nn;
4138       c->nonzerorowcnt += (PetscInt)!!nn;
4139       c->rmax = PetscMax(c->rmax,nn);
4140     }
4141     ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
4142     ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
4143     (*C)->nonzerostate++;
4144     ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
4145     ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
4146     Ccusp->nonzerostate = (*C)->nonzerostate;
4147     (*C)->preallocated  = PETSC_TRUE;
4148   } else {
4149     if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %D != %D",(*C)->rmap->n,B->rmap->n);
4150     c = (Mat_SeqAIJ*)(*C)->data;
4151     if (c->nz) {
4152       Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
4153       if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
4154       if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4155       if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
4156       ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
4157       ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
4158       if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4159       if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4160       Acsr = (CsrMatrix*)Acusp->mat->mat;
4161       Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4162       Ccsr = (CsrMatrix*)Ccusp->mat->mat;
4163       if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
4164       if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
4165       if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
4166       if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
4167       if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
4168       auto pmid = Ccusp->cooPerm->begin();
4169       thrust::advance(pmid,Acsr->num_entries);
4170       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4171       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
4172                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
4173       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
4174                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4175       thrust::for_each(zibait,zieait,VecCUDAEquals());
4176       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
4177                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4178       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
4179                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
4180       thrust::for_each(zibbit,ziebit,VecCUDAEquals());
4181       ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr);
4182       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
4183         if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4184         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4185         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4186         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4187         CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
4188         auto vT = CcsrT->values->begin();
4189         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4190         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4191         (*C)->transupdated = PETSC_TRUE;
4192       }
4193       cerr = WaitForCUDA();CHKERRCUDA(cerr);
4194       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4195     }
4196   }
4197   ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
4198   (*C)->assembled     = PETSC_TRUE;
4199   (*C)->was_assembled = PETSC_FALSE;
4200   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
4201   PetscFunctionReturn(0);
4202 }
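
/*
  Note on the scatter above: zipping a source range with a permutation of the
  destination and applying VecCUDAEquals() element-wise gives dst[perm[i]] = src[i].
  A minimal sketch of the pattern, with hypothetical device vectors src, dst and
  permutation perm (illustrative only, not PETSc API):

    auto zb = thrust::make_zip_iterator(thrust::make_tuple(src->begin(),
                thrust::make_permutation_iterator(dst->begin(),perm->begin())));
    auto ze = thrust::make_zip_iterator(thrust::make_tuple(src->end(),
                thrust::make_permutation_iterator(dst->begin(),perm->end())));
    thrust::for_each(zb,ze,VecCUDAEquals()); // dst[perm[i]] = src[i]
*/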
4203 
4204 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
4205 {
4206   PetscErrorCode    ierr;
4207   bool              dmem;
4208   const PetscScalar *av;
4209   cudaError_t       cerr;
4210 
4211   PetscFunctionBegin;
4212   dmem = isCudaMem(v);
4213   ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
4214   if (n && idx) {
4215     THRUSTINTARRAY widx(n);
4216     widx.assign(idx,idx+n);
4217     ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
4218 
4219     THRUSTARRAY *w = NULL;
4220     thrust::device_ptr<PetscScalar> dv;
4221     if (dmem) {
4222       dv = thrust::device_pointer_cast(v);
4223     } else {
4224       w = new THRUSTARRAY(n);
4225       dv = w->data();
4226     }
4227     thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);
4228 
4229     auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
4230     auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
4231     thrust::for_each(zibit,zieit,VecCUDAEquals());
4232     if (w) {
4233       cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4234     }
4235     delete w;
4236   } else {
4237     cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4238   }
4239   if (!dmem) { ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr); } /* data moved device -> host here */
4240   ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
4241   PetscFunctionReturn(0);
4242 }
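
/*
  A usage sketch for the gather above (hypothetical sizes and indices; v may be
  host or device memory, detected with isCudaMem()):

    PetscInt    idx[] = {0,5,7};
    PetscScalar v[3];
    ierr = MatSeqAIJCopySubArray_SeqAIJCUSPARSE(A,3,idx,v);CHKERRQ(ierr); // v[k] = aa[idx[k]]
*/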
4243 
4244 /*
4245   LU BAND factorization with an optimization for block-diagonal matrices (Nf blocks) in natural order (-mat_no_inode -pc_factor_mat_ordering_type rcm with Nf>1 fields)
4246 
4247   requires:
4248      a structurally symmetric matrix; this could be relaxed with transpose/column metadata
4249 */
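
/*
  Banded storage sketch used by the kernels below: row r holds the clipped band
  columns [max(0,r-bw), min(n-1,r+bw)] contiguously in ba_csr, with row offsets
  in bi_csr. For example, n = 4 and bw = 1 give

    row 0: a00 a01         (2 entries)
    row 1: a10 a11 a12     (3 entries)
    row 2: a21 a22 a23     (3 entries)
    row 3: a32 a33         (2 entries)   -> bi_csr = {0,2,5,8,10}
*/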
4250 
4251 /*
4252   The GPU LU factor kernel
4253 */
4254 __global__
4255 void __launch_bounds__(1024,1)
4256 mat_lu_factor_band_init_set_i(const PetscInt n, const int bw, int bi_csr[])
4257 {
4258   const PetscInt  Nf = gridDim.x, Nblk = gridDim.y, nloc = n/Nf;
4259   const PetscInt  field = blockIdx.x, blkIdx = blockIdx.y;
4260   const PetscInt  nloc_i =  (nloc/Nblk + !!(nloc%Nblk)), start_i = field*nloc + blkIdx*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
4261 
4262   // set i (row+1)
4263   if (threadIdx.x + threadIdx.y + blockIdx.x + blockIdx.y == 0) bi_csr[0] = 0; // dummy at zero
4264   // for (int rowb = start_i + blkIdx*blockDim.y + threadIdx.y; rowb < end_i; rowb += Nblk*blockDim.y) { // rows in block
4265   for (int rowb = start_i + threadIdx.y; rowb < end_i; rowb += blockDim.y) { // rows in block by thread y
4266     if (rowb < end_i && threadIdx.x==0) {
4267       PetscInt i=rowb+1, ni = (rowb>bw) ? bw+1 : i, n1L = ni*(ni-1)/2, nug= i*bw, n2L = bw*((rowb>bw) ? (rowb-bw) : 0), mi = bw + rowb + 1 - n, clip = (mi>0) ? mi*(mi-1)/2 + mi: 0;
4268       bi_csr[rowb+1] = n1L + nug - clip + n2L + i;
4269     }
4270   }
4271 }
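/*
  Sanity check of the closed form above with n = 4, bw = 1, rowb = 3: i = 4,
  ni = 2, n1L = 1, nug = 4, n2L = 2, mi = 1 so clip = 1, and
  bi_csr[4] = 1 + 4 - 1 + 2 + 4 = 10, matching the storage sketch above.
*/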
4272 // copy AIJ to AIJ_BAND
4273 __global__
4274 void __launch_bounds__(1024,1)
4275 mat_lu_factor_band_copy_aij_aij(const PetscInt n, const int bw, const PetscInt r[], const PetscInt ic[],
4276                                 const int ai_d[], const int aj_d[], const PetscScalar aa_d[],
4277                                 const int bi_csr[], PetscScalar ba_csr[])
4278 {
4279   const PetscInt  Nf = gridDim.x, Nblk = gridDim.y, nloc = n/Nf;
4280   const PetscInt  field = blockIdx.x, blkIdx = blockIdx.y;
4281   const PetscInt  nloc_i =  (nloc/Nblk + !!(nloc%Nblk)), start_i = field*nloc + blkIdx*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
4282 
4283   // zero B
4284   if (threadIdx.x + threadIdx.y + blockIdx.x + blockIdx.y == 0) ba_csr[bi_csr[n]] = 0; // zero the extra slot at the end (the place reserved for flops)
4285   for (int rowb = start_i + threadIdx.y; rowb < end_i; rowb += blockDim.y) { // rows in block by thread y
4286     if (rowb < end_i) {
4287       PetscScalar    *batmp = ba_csr + bi_csr[rowb];
4288       const PetscInt nzb = bi_csr[rowb+1] - bi_csr[rowb];
4289       for (int j=threadIdx.x ; j<nzb ; j += blockDim.x) {
4290         if (j<nzb) {
4291           batmp[j] = 0;
4292         }
4293       }
4294     }
4295   }
4296 
4297   // copy A into B with CSR format -- these two loops can be fused
4298   for (int rowb = start_i + threadIdx.y; rowb < end_i; rowb += blockDim.y) { // rows in block by thread y
4299     if (rowb < end_i) {
4300       const PetscInt    rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
4301       const int         *ajtmp = aj_d + ai_d[rowa], bjStart = (rowb>bw) ? rowb-bw : 0;
4302       const PetscScalar *av    = aa_d + ai_d[rowa];
4303       PetscScalar       *batmp = ba_csr + bi_csr[rowb];
4304       /* load in initial (unfactored row) */
4305       for (int j=threadIdx.x ; j<nza ; j += blockDim.x) {
4306         if (j<nza) {
4307           PetscInt    colb = ic[ajtmp[j]], idx = colb - bjStart;
4308           PetscScalar vala = av[j];
4309           batmp[idx] = vala;
4310         }
4311       }
4312     }
4313   }
4314 }
4315 // print AIJ_BAND
4316 __global__
4317 void print_mat_aij_band(const PetscInt n, const int bi_csr[], const PetscScalar ba_csr[])
4318 {
4319   // debug
4320   if (threadIdx.x + threadIdx.y + blockIdx.x + blockIdx.y == 0){
4321     printf("B (AIJ) n=%d:\n",(int)n);
4322     for (int rowb=0;rowb<n;rowb++) {
4323       const PetscInt    nz = bi_csr[rowb+1] - bi_csr[rowb];
4324       const PetscScalar *batmp = ba_csr + bi_csr[rowb];
4325       for (int j=0; j<nz; j++) printf("(%13.6e) ",PetscRealPart(batmp[j]));
4326       printf(" bi=%d\n",bi_csr[rowb+1]);
4327     }
4328   }
4329 }
4330 // Band LU kernel ---  ba_csr bi_csr
4331 __global__
4332 void __launch_bounds__(1024,1)
4333 mat_lu_factor_band(const PetscInt n, const PetscInt bw, const int bi_csr[], PetscScalar ba_csr[])
4334 {
4335   extern __shared__ PetscInt smemInt[];
4336   PetscInt        *sm_pkIdx  = &smemInt[0];
4337   const PetscInt  Nf = gridDim.x, Nblk = gridDim.y, nloc = n/Nf;
4338   const PetscInt  field = blockIdx.x, blkIdx = blockIdx.y;
4339   const PetscInt  start = field*nloc, end = start + nloc;
4340 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4341   auto g = cooperative_groups::this_grid();
4342 #endif
4343   // A22 panel update for each row A(1,:) and col A(:,1)
4344   for (int glbDD=start, locDD = 0; glbDD<end; glbDD++, locDD++) {
4345     PetscInt          tnzUd = bw, maxU = end-1 - glbDD; // chop the band at the field boundary (the inter-field ears)
4346     const PetscInt    nzUd  = (tnzUd>maxU) ? maxU : tnzUd, dOffset = (glbDD > bw) ? bw : glbDD; // global to go past ears after first
4347     const PetscInt    nzUd_pad = blockDim.y*(nzUd/blockDim.y + !!(nzUd%blockDim.y));
4348     PetscScalar       *pBdd = ba_csr + bi_csr[glbDD] + dOffset;
4349     const PetscScalar *baUd = pBdd + 1; // vector of data  U(i,i+1:end)
4350     const PetscScalar Bdd = *pBdd;
4351     const PetscInt offset = blkIdx*blockDim.y + threadIdx.y, inc = Nblk*blockDim.y;
4352     for (int idx = offset, myi = glbDD + offset + 1; idx < nzUd_pad ; idx += inc, myi += inc) { /* assuming symmetric structure */
4353       if (idx < nzUd && threadIdx.x==0) { /* assuming symmetric structure */
4354         const PetscInt bwi = myi > bw ? bw : myi, kIdx = bwi - (myi-glbDD); // cuts off just the first (global) block
4355         PetscScalar    *Aid = ba_csr + bi_csr[myi] + kIdx;
4356         *Aid = *Aid/Bdd;
4357         sm_pkIdx[threadIdx.y] = kIdx;
4358       }
4359       __syncthreads(); // sync the block: only threadIdx.x==0 wrote sm_pkIdx above
4360       if (idx < nzUd) { /* assuming symmetric structure */
4361         PetscInt    kIdx = sm_pkIdx[threadIdx.y];
4362         PetscScalar *Aid = ba_csr + bi_csr[myi] + kIdx;
4363         PetscScalar *Aij =  Aid + 1;
4364         PetscScalar Lid  = *Aid;
4365         for (int jIdx=threadIdx.x ; jIdx<nzUd ; jIdx += blockDim.x) {
4366           if (jIdx<nzUd) {
4367             Aij[jIdx] -= Lid*baUd[jIdx];
4368           }
4369         }
4370       }
4371     }
4372 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4373     g.sync();
4374 #else
4375     __syncthreads();
4376 #endif
4377   } /* end of for (glbDD=start; glbDD<end; glbDD++) */
4378 }
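
/*
  A serial reference for the update above (a sketch ignoring the band clipping
  and the Nf fields): right-looking LU without pivoting,

    for (k = 0; k < n; k++)
      for (i = k+1; i <= PetscMin(k+bw,n-1); i++) {
        A[i][k] /= A[k][k];                      // scale the column of L
        for (j = k+1; j <= PetscMin(k+bw,n-1); j++)
          A[i][j] -= A[i][k]*A[k][j];            // rank-1 update of the trailing block
      }
*/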
4379 
4380 static PetscErrorCode MatSolve_SeqAIJCUSPARSEBAND(Mat,Vec,Vec);
4381 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSEBAND(Mat B,Mat A,const MatFactorInfo *info)
4382 {
4383   Mat_SeqAIJ                   *b = (Mat_SeqAIJ*)B->data;
4384   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
4385   if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
4386   Mat_SeqAIJCUSPARSE           *cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr;
4387   Mat_SeqAIJCUSPARSEMultStruct *matstructA;
4388   CsrMatrix                    *matrixA;
4389   PetscErrorCode               ierr;
4390   cudaError_t                  cerr;
4391   const PetscInt               n=A->rmap->n, *ic, *r;
4392   const int                    *ai_d, *aj_d;
4393   const PetscScalar            *aa_d;
4394   PetscScalar                  *ba_t = cusparseTriFactors->a_band_d;
4395   int                          *bi_t = cusparseTriFactors->i_band_d;
4396   PetscContainer               container;
4397   int                          Ni = 10, team_size=9, Nf, nVec=56, nconcurrent = 1, nsm = -1;
4398 
4399   PetscFunctionBegin;
4400   if (A->rmap->n == 0) {
4401     PetscFunctionReturn(0);
4402   }
4403   // cusparse setup
4404   if (!cusparsestructA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparsestructA");
4405   matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat; //  matstruct->cprowIndices
4406   if (!matstructA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing mat struct");
4407   matrixA = (CsrMatrix*)matstructA->mat;
4408   if (!matrixA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing matrix cusparsestructA->mat->mat");
4409 
4410   // factor: get Nf if available
4411   ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr);
4412   if (container) {
4413     PetscInt *pNf=NULL;
4414     ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr);
4415     Nf = (*pNf)%1000;
4416     if ((*pNf)/1000>0) nconcurrent = (*pNf)/1000; // number of SMs to use
4417   } else Nf = 1;
4418   if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n %D not divisible by Nf %D",n,Nf);
4419 
4420   // get data
4421   ic      = thrust::raw_pointer_cast(cusparseTriFactors->cpermIndices->data());
4422   ai_d    = thrust::raw_pointer_cast(matrixA->row_offsets->data());
4423   aj_d    = thrust::raw_pointer_cast(matrixA->column_indices->data());
4424   aa_d    = thrust::raw_pointer_cast(matrixA->values->data());
4425   r       = thrust::raw_pointer_cast(cusparseTriFactors->rpermIndices->data());
4426 
4427   cerr = WaitForCUDA();CHKERRCUDA(cerr);
4428   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4429   {
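    /* Recover the bandwidth from the banded nonzero count: nz = n + (2n-1)*bw - bw^2,
       so bw^2 - (2n-1)*bw + (nz - n) = 0, whose small root is
       bw = ((2n-1) - sqrt(1 + 4(n^2 - nz)))/2, the expression below */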
4430     int bw = (2*n-1 - (int)(PetscSqrtReal(1+4*(n*n-b->nz))+PETSC_MACHINE_EPSILON))/2, bm1=bw-1,nl=n/Nf;
4431     int gpuid;
4432     cudaDeviceProp prop;
4433     cerr = cudaGetDevice(&gpuid);CHKERRCUDA(cerr);
4434     cerr = cudaGetDeviceProperties(&prop, gpuid);CHKERRCUDA(cerr);
4435 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
4436     Ni = 1; // cooperative kernel launch needs CUDA 11, so use a single block row per field
4438 #else
4439     nsm = prop.multiProcessorCount;
4440     Ni = nsm/Nf/nconcurrent;
4441 #endif
4442     team_size = bw/Ni + !!(bw%Ni);
4443     nVec = PetscMin(bw, 1024/team_size);
4444     ierr = PetscInfo5(A,"Matrix Bandwidth = %d, number SMs/block = %d, num concurrency = %d, num fields = %d, numSMs/GPU = %d\n",bw,Ni,nconcurrent,Nf,nsm);CHKERRQ(ierr);
4445     {
4446       dim3 dimBlockTeam(nVec,team_size);
4447       dim3 dimBlockLeague(Nf,Ni);
4448       mat_lu_factor_band_copy_aij_aij<<<dimBlockLeague,dimBlockTeam>>>(n, bw, r, ic, ai_d, aj_d, aa_d, bi_t, ba_t);
4449       CHECK_LAUNCH_ERROR(); // does a sync
4450 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4451       void *kernelArgs[] = { (void*)&n, (void*)&bw, (void*)&bi_t, (void*)&ba_t};
4452       cerr = cudaLaunchCooperativeKernel((void*)mat_lu_factor_band, dimBlockLeague, dimBlockTeam, kernelArgs, team_size*sizeof(PetscInt), NULL);CHKERRCUDA(cerr);
4453 #else
4454       mat_lu_factor_band<<<dimBlockLeague,dimBlockTeam,team_size*sizeof(PetscInt)>>>(n, bw, bi_t, ba_t);
4455 #endif
4456       CHECK_LAUNCH_ERROR(); // does a sync
4457 #if defined(PETSC_USE_LOG)
4458       ierr = PetscLogGpuFlops((PetscLogDouble)Nf*(bm1*(bm1 + 1)*(2*bm1 + 1)/3 + 2*(nl-bw)*bw*bw + nl*(nl+1)/2));CHKERRQ(ierr);
4459 #endif
4460     }
4461   }
4462   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4463 
4464   /* determine which version of MatSolve needs to be used; adapted from MatLUFactorNumeric_AIJ_SeqAIJCUSPARSE */
4465   B->ops->solve = MatSolve_SeqAIJCUSPARSEBAND;
4466   B->ops->solvetranspose = NULL; // need transpose
4467   B->ops->matsolve = NULL;
4468   B->ops->matsolvetranspose = NULL;
4469 
4470   PetscFunctionReturn(0);
4471 }
4472 
4473 static PetscErrorCode MatrixNfDestroy(void *ptr)
4474 {
4475   PetscInt *nf = (PetscInt *)ptr;
4476   PetscErrorCode  ierr;
4477   PetscFunctionBegin;
4478   ierr = PetscFree(nf);CHKERRQ(ierr);
4479   PetscFunctionReturn(0);
4480 }
4481 
4482 PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSEBAND(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
4483 {
4484   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data,*b;
4485   IS                 isicol;
4486   PetscErrorCode     ierr;
4487   cudaError_t        cerr;
4488   const PetscInt     *ic,*ai=a->i,*aj=a->j;
4489   PetscScalar        *ba_t;
4490   int                *bi_t;
4491   PetscInt           i,n=A->rmap->n,Nf;
4492   PetscInt           nzBcsr,bwL,bwU;
4493   PetscBool          missing;
4494   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
4495   PetscContainer               container;
4496 
4497   PetscFunctionBegin;
4498   if (A->rmap->N != A->cmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"matrix must be square");
4499   ierr = MatMissingDiagonal(A,&missing,&i);CHKERRQ(ierr);
4500   if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",i);
4501   if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"!cusparseTriFactors");
4502   ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&missing);CHKERRQ(ierr);
4503   if (!missing) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"only structurally symmetric matrices are supported");
4504 
4505   // factor: get Nf if available
4506   ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr);
4507   if (container) {
4508     PetscInt *pNf=NULL;
4509     ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr);
4510     Nf = (*pNf)%1000;
4511     ierr = PetscContainerCreate(PETSC_COMM_SELF, &container);CHKERRQ(ierr);
4512     ierr = PetscMalloc(sizeof(PetscInt), &pNf);CHKERRQ(ierr);
4513     *pNf = Nf;
4514     ierr = PetscContainerSetPointer(container, (void *)pNf);CHKERRQ(ierr);
4515     ierr = PetscContainerSetUserDestroy(container, MatrixNfDestroy);CHKERRQ(ierr);
4516     ierr = PetscObjectCompose((PetscObject)B, "Nf", (PetscObject) container);CHKERRQ(ierr);
4517     ierr = PetscContainerDestroy(&container);CHKERRQ(ierr);
4518   } else Nf = 1;
4519   if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n %D not divisible by Nf %D",n,Nf);
4520 
4521   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
4522   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
4523 
4524   ierr = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
4525   ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)isicol);CHKERRQ(ierr);
4526   b    = (Mat_SeqAIJ*)(B)->data;
4527 
4528   /* get band widths, MatComputeBandwidth should take a reordering ic and do this */
4529   bwL = bwU = 0;
4530   for (int rwb=0; rwb<n; rwb++) {
4531     const PetscInt rwa = ic[rwb], anz = ai[rwb+1] - ai[rwb], *ajtmp = aj + ai[rwb];
4532     for (int j=0;j<anz;j++) {
4533       PetscInt colb = ic[ajtmp[j]];
4534       if (colb<rwa) { // L
4535         if (rwa-colb > bwL) bwL = rwa-colb;
4536       } else {
4537         if (colb-rwa > bwU) bwU = colb-rwa;
4538       }
4539     }
4540   }
4541   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
4542   /* only structurally symmetric matrices are supported for now, though an unsymmetric band might work */
4543   if (bwL!=bwU) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Only symmetric structure supported (now) W_L=%D W_U=%D",bwL,bwU);
4544   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
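  /* nonzeros in the band: the full band has (2*bwU+1)*n entries, minus the
     bwU*(bwU+1)/2 entries clipped at each of the two corners, giving
     n + (2n-1)*bwU - bwU^2 (e.g. n = 4, bwU = 1 gives 10) */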
4545   nzBcsr = n + (2*n-1)*bwU - bwU*bwU;
4546   b->maxnz = b->nz = nzBcsr;
4547   cusparseTriFactors->nnz = b->nz; // only meta data needed: n & nz
4548   if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
4549   cerr = cudaMalloc(&ba_t,(b->nz+1)*sizeof(PetscScalar));CHKERRCUDA(cerr); // include a place for flops
4550   cerr = cudaMalloc(&bi_t,(n+1)*sizeof(int));CHKERRCUDA(cerr);
4551   cusparseTriFactors->a_band_d = ba_t;
4552   cusparseTriFactors->i_band_d = bi_t;
4553   /* In b structure:  Free imax, ilen, old a, old j.  Allocate solve_work, new a, new j */
4554   ierr = PetscLogObjectMemory((PetscObject)B,(nzBcsr+1)*(sizeof(PetscInt)+sizeof(PetscScalar)));CHKERRQ(ierr);
4555   {
4556     dim3 dimBlockTeam(1,128);
4557     dim3 dimBlockLeague(Nf,1);
4558     mat_lu_factor_band_init_set_i<<<dimBlockLeague,dimBlockTeam>>>(n, bwU, bi_t);
4559   }
4560   CHECK_LAUNCH_ERROR(); // does a sync
4561 
4562   // setup data
4563   if (!cusparseTriFactors->rpermIndices) {
4564     const PetscInt *r;
4565 
4566     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
4567     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
4568     cusparseTriFactors->rpermIndices->assign(r, r+n);
4569     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
4570     ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
4571   }
4572   /* upper triangular indices */
4573   if (!cusparseTriFactors->cpermIndices) {
4574     const PetscInt *c;
4575 
4576     ierr = ISGetIndices(isicol,&c);CHKERRQ(ierr);
4577     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
4578     cusparseTriFactors->cpermIndices->assign(c, c+n);
4579     ierr = ISRestoreIndices(isicol,&c);CHKERRQ(ierr);
4580     ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
4581   }
4582 
4583   /* put together the new matrix */
4584   b->free_a       = PETSC_FALSE;
4585   b->free_ij      = PETSC_FALSE;
4586   b->singlemalloc = PETSC_FALSE;
4587   b->ilen = NULL;
4588   b->imax = NULL;
4589   b->row  = isrow;
4590   b->col  = iscol;
4591   ierr    = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
4592   ierr    = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
4593   b->icol = isicol;
4594   ierr    = PetscMalloc1(n+1,&b->solve_work);CHKERRQ(ierr);
4595 
4596   B->factortype            = MAT_FACTOR_LU;
4597   B->info.factor_mallocs   = 0;
4598   B->info.fill_ratio_given = 0;
4599 
4600   if (ai[n]) {
4601     B->info.fill_ratio_needed = ((PetscReal)(nzBcsr))/((PetscReal)ai[n]);
4602   } else {
4603     B->info.fill_ratio_needed = 0.0;
4604   }
4605 #if defined(PETSC_USE_INFO)
4606   if (ai[n] != 0) {
4607     PetscReal af = B->info.fill_ratio_needed;
4608     ierr = PetscInfo1(A,"Band fill ratio %g\n",(double)af);CHKERRQ(ierr);
4609   } else {
4610     ierr = PetscInfo(A,"Empty matrix\n");CHKERRQ(ierr);
4611   }
4612 #endif
4613   if (a->inode.size) {
4614     ierr = PetscInfo(A,"Warning: using inodes in band solver.\n");CHKERRQ(ierr);
4615   }
4616   ierr = MatSeqAIJCheckInode_FactorLU(B);CHKERRQ(ierr);
4617   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSEBAND;
4618   B->offloadmask = PETSC_OFFLOAD_GPU;
4619 
4620   PetscFunctionReturn(0);
4621 }
4622 
4623 /* Use -pc_factor_mat_solver_type cusparseband */
4624 PetscErrorCode MatFactorGetSolverType_seqaij_cusparse_band(Mat A,MatSolverType *type)
4625 {
4626   PetscFunctionBegin;
4627   *type = MATSOLVERCUSPARSEBAND;
4628   PetscFunctionReturn(0);
4629 }
4630 
4631 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat A,MatFactorType ftype,Mat *B)
4632 {
4633   PetscErrorCode ierr;
4634   PetscInt       n = A->rmap->n;
4635 
4636   PetscFunctionBegin;
4637   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
4638   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
4639   (*B)->factortype = ftype;
4640   (*B)->useordering = PETSC_TRUE;
4641   ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
4642 
4643   if (ftype == MAT_FACTOR_LU) {
4644     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
4645     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
4646     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSEBAND;
4647   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSEBAND Matrix Types");
4648 
4649   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
4650   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse_band);CHKERRQ(ierr);
4651   PetscFunctionReturn(0);
4652 }
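
/*
  A typical command-line use of this factorization (a sketch; the solver type
  string returned above is MATSOLVERCUSPARSEBAND):

    ./app -mat_type seqaijcusparse -pc_type lu -pc_factor_mat_solver_type cusparseband
*/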
4653 
4654 #define WARP_SIZE 32
4655 template <typename T>
4656 __forceinline__ __device__
4657 T wreduce(T a)
4658 {
4659   T b;
4660   #pragma unroll
4661   for (int i = WARP_SIZE/2; i >= 1; i = i >> 1) {
4662     b = __shfl_down_sync(0xffffffff, a, i);
4663     a += b;
4664   }
4665   return a;
4666 }
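/*
  wreduce() above is the standard warp shuffle reduction: each step halves the
  active width, a[l] += a[l+16], then +8, +4, +2, +1, so after log2(WARP_SIZE)
  steps lane 0 holds the sum over the warp (other lanes hold partial sums).
*/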
4667 // reduce in a block, returns result in thread 0
4668 template <typename T, int BLOCK_SIZE>
4669 __device__
4670 T breduce(T a)
4671 {
4672   constexpr int NWARP = BLOCK_SIZE/WARP_SIZE;
4673   __shared__ T buf[NWARP]; // use T (not double) so the reduction is correct for any scalar precision
4674   int wid = threadIdx.x / WARP_SIZE;
4675   int laneid = threadIdx.x % WARP_SIZE;
4676   T b = wreduce<T>(a);
4677   if (laneid == 0)
4678     buf[wid] = b;
4679   __syncthreads();
4680   if (wid == 0) {
4681     if (threadIdx.x < NWARP)
4682       a = buf[threadIdx.x];
4683     else
4684       a = 0;
4685     for (int i = (NWARP+1)/2; i >= 1; i = i >> 1) {
4686       a += __shfl_down_sync(0xffffffff, a, i);
4687     }
4688   }
4689   return a;
4690 }
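/*
  breduce() combines the per-warp sums through shared memory: each warp leader
  writes its partial sum to buf[], then warp 0 reduces the NWARP partials.
  Only thread 0 of the block ends up with the full sum, and every thread of
  the block must call it because of the __syncthreads() inside.
*/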
4691 
4692 
4693 // Band LU kernel ---  ba_csr bi_csr
4694 template <int BLOCK_SIZE>
4695 __global__
4696 void __launch_bounds__(256,1)
4697 mat_solve_band(const PetscInt n, const PetscInt bw, const PetscScalar ba_csr[], PetscScalar x[])
4698 {
4699   const PetscInt    Nf = gridDim.x, nloc = n/Nf, field = blockIdx.x, start = field*nloc, end = start + nloc, chopnz = bw*(bw+1)/2, blocknz=(2*bw+1)*nloc, blocknz_0 = blocknz-chopnz;
4700   const PetscScalar *pLi;
4701   const int tid = threadIdx.x;
4702 
4703   /* Next, solve L */
4704   pLi = ba_csr + (field==0 ? 0 : blocknz_0 + (field-1)*blocknz + bw); // diagonal (0,0) in field
4705   for (int glbDD=start, locDD = 0; glbDD<end; glbDD++, locDD++) {
4706     const PetscInt col = locDD<bw ? start : (glbDD-bw);
4707     PetscScalar t = 0;
4708     for (int j=col+tid,idx=tid;j<glbDD;j+=blockDim.x,idx+=blockDim.x) {
4709       t += pLi[idx]*x[j];
4710     }
4711 #if defined(PETSC_USE_COMPLEX)
4712     PetscReal tr = PetscRealPartComplex(t), ti = PetscImaginaryPartComplex(t);
4713     PetscScalar tt(breduce<PetscReal,BLOCK_SIZE>(tr), breduce<PetscReal,BLOCK_SIZE>(ti));
4714     t = tt;
4715 #else
4716     t = breduce<PetscReal,BLOCK_SIZE>(t);
4717 #endif
4718     if (threadIdx.x == 0)
4719       x[glbDD] -= t; // /1.0
4720     __syncthreads();
4721     // inc
4722     pLi += glbDD-col; // get to diagonal
4723     if (glbDD > n-1-bw) pLi += n-1-glbDD; // skip over U, only last block has funny offset
4724     else pLi += bw;
4725     pLi += 1; // skip to next row
4726     if (field>0 && (locDD+1)<bw) pLi += bw-(locDD+1); // skip padding at beginning (ear)
4727   }
4728   /* Then, solve U */
4729   pLi = ba_csr + Nf*blocknz - 2*chopnz - 1; // end of real data on block (diagonal)
4730   if (field != Nf-1) pLi -= blocknz_0 + (Nf-2-field)*blocknz + bw; // diagonal of last local row
4731   for (int glbDD=end-1, locDD = 0; glbDD >= start; glbDD--, locDD++) {
4732     const PetscInt col = (locDD<bw) ? end-1 : glbDD+bw; // end of row in U
4733     PetscScalar t = 0;
4734     for (int j=col-tid,idx=tid;j>glbDD;j-=blockDim.x,idx+=blockDim.x) {
4735       t += pLi[-idx]*x[j];
4736     }
4737 #if defined(PETSC_USE_COMPLEX)
4738     PetscReal tr = PetscRealPartComplex(t), ti = PetscImaginaryPartComplex(t);
4739     PetscScalar tt(breduce<PetscReal,BLOCK_SIZE>(tr), breduce<PetscReal,BLOCK_SIZE>(ti));
4740     t = tt;
4741 #else
4742     t = breduce<PetscReal,BLOCK_SIZE>(t);
4743 #endif
4744     pLi -= col-glbDD; // diagonal
4745     if (threadIdx.x == 0) {
4746       x[glbDD] -= t;
4747       x[glbDD] /= pLi[0];
4748     }
4749     __syncthreads();
4750     // inc past L to start of previous U
4751     pLi -= bw+1;
4752     if (glbDD<bw) pLi += bw-glbDD; // overshot in top left corner
4753     if (((locDD+1) < bw) && field != Nf-1) pLi -= (bw - (locDD+1)); // skip past right corner
4754   }
4755 }
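
/*
  A serial sketch of the per-field solve above (unit-diagonal L, then U),
  ignoring the clipped corners:

    for (i = 0; i < n; i++)                                        // L y = b
      for (j = PetscMax(0,i-bw); j < i; j++) x[i] -= L[i][j]*x[j];
    for (i = n-1; i >= 0; i--) {                                   // U x = y
      for (j = i+1; j <= PetscMin(n-1,i+bw); j++) x[i] -= U[i][j]*x[j];
      x[i] /= U[i][i];
    }
*/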
4756 
4757 static PetscErrorCode MatSolve_SeqAIJCUSPARSEBAND(Mat A,Vec bb,Vec xx)
4758 {
4759   const PetscScalar                     *barray;
4760   PetscScalar                           *xarray;
4761   thrust::device_ptr<const PetscScalar> bGPU;
4762   thrust::device_ptr<PetscScalar>       xGPU;
4763   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
4764   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
4765   PetscInt                              n=A->rmap->n, nz=cusparseTriFactors->nnz, bw=(2*n-1 - (int)(PetscSqrtReal(1+4*(n*n-nz))+PETSC_MACHINE_EPSILON))/2, Nf;
4766   PetscErrorCode                        ierr;
4767   cudaError_t                           cerr;
4768   PetscContainer                        container;
4769 
4770   PetscFunctionBegin;
4771   if (A->rmap->n == 0) {
4772     PetscFunctionReturn(0);
4773   }
4774   // factor: get Nf if available
4775   ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr);
4776   if (container) {
4777     PetscInt *pNf=NULL;
4778     ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr);
4779     Nf = (*pNf)%1000;
4780   } else Nf = 1;
4781   if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n %D not divisible by Nf %D",n,Nf);
4782 
4783   /* Get the GPU pointers */
4784   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
4785   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
4786   xGPU = thrust::device_pointer_cast(xarray);
4787   bGPU = thrust::device_pointer_cast(barray);
4788 
4789   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4790   /* First, reorder with the row permutation */
4791   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
4792                thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
4793                tempGPU->begin());
4794   constexpr int block = 128;
4795   mat_solve_band<block><<<Nf,block>>>(n,bw,cusparseTriFactors->a_band_d,tempGPU->data().get());
4796   CHECK_LAUNCH_ERROR(); // does a sync
4797 
4798   /* Last, reorder with the column permutation */
4799   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
4800                thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
4801                xGPU);
4802 
4803   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
4804   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
4805   cerr = WaitForCUDA();CHKERRCUDA(cerr);
4806   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4807   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
4808   PetscFunctionReturn(0);
4809 }
4810