xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision af4fa82cc29c77689f3cd2af837601dbdc3602c2)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library.
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_CXX_COMPLEX_FIX
7 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
8 
9 #include <petscconf.h>
10 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
11 #include <../src/mat/impls/sbaij/seq/sbaij.h>
12 #include <../src/vec/vec/impls/dvecimpl.h>
13 #include <petsc/private/vecimpl.h>
14 #undef VecType
15 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
16 #include <thrust/async/for_each.h>
17 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
18 #include <cooperative_groups.h>
19 #endif
20 const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
21 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
22   /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc., we list them in
23     0-based integer-value order, since PetscOptionsEnum() is used to parse the user's command-line options for them; see the usage sketch after the tables below.
24 
25   typedef enum {
26       CUSPARSE_MV_ALG_DEFAULT = 0,
27       CUSPARSE_COOMV_ALG      = 1,
28       CUSPARSE_CSRMV_ALG1     = 2,
29       CUSPARSE_CSRMV_ALG2     = 3
30   } cusparseSpMVAlg_t;
31 
32   typedef enum {
33       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
34       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
35       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
36       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
37       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
38       CUSPARSE_SPMM_ALG_DEFAULT = 0,
39       CUSPARSE_SPMM_COO_ALG1    = 1,
40       CUSPARSE_SPMM_COO_ALG2    = 2,
41       CUSPARSE_SPMM_COO_ALG3    = 3,
42       CUSPARSE_SPMM_COO_ALG4    = 5,
43       CUSPARSE_SPMM_CSR_ALG1    = 4,
44       CUSPARSE_SPMM_CSR_ALG2    = 6,
45   } cusparseSpMMAlg_t;
46 
47   typedef enum {
48       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
49       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
50   } cusparseCsr2CscAlg_t;
51   */
52   const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
53   const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
54   const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
55 #endif
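
/* Illustrative sketch (not part of the library API): with the tables above, PetscOptionsEnum()
   maps a command-line string to the enum whose 0-based position in the table matches the cuSPARSE
   integer value, e.g. for a hypothetical executable ./app

     ./app -mat_type seqaijcusparse -mat_cusparse_spmv_alg csrmv_alg1 -mat_cusparse_csr2csc_alg alg1

   "csrmv_alg1" sits at index 2 of MatCUSPARSESpMVAlgorithms[], which is why MatSetFromOptions_SeqAIJCUSPARSE()
   below double-checks that CUSPARSE_CSRMV_ALG1 == 2 before passing the parsed value to cuSPARSE. */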
56 
57 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
58 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
59 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
60 
61 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSEBAND(Mat,Mat,IS,IS,const MatFactorInfo*);
62 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSEBAND(Mat,Mat,const MatFactorInfo*);
63 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
64 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
65 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
66 
67 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
68 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
69 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
70 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
71 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
72 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
73 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
74 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
75 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
76 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
77 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
78 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
79 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
80 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
81 
82 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
83 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
84 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
85 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
86 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
87 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
88 
89 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
90 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
91 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);
92 
93 PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
94 PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
95 
96 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
97 
98 PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
99 {
100   cusparseStatus_t   stat;
101   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
102 
103   PetscFunctionBegin;
104   if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
105   cusparsestruct->stream = stream;
106   stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
107   PetscFunctionReturn(0);
108 }
109 
110 PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
111 {
112   cusparseStatus_t   stat;
113   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
114 
115   PetscFunctionBegin;
116   if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
117   if (cusparsestruct->handle != handle) {
118     if (cusparsestruct->handle) {
119       stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
120     }
121     cusparsestruct->handle = handle;
122   }
123   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
124   PetscFunctionReturn(0);
125 }
126 
127 PetscErrorCode MatCUSPARSEClearHandle(Mat A)
128 {
129   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
130   PetscBool          flg;
131   PetscErrorCode     ierr;
132 
133   PetscFunctionBegin;
134   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
135   if (!flg || !cusparsestruct) PetscFunctionReturn(0);
136   if (cusparsestruct->handle) cusparsestruct->handle = 0;
137   PetscFunctionReturn(0);
138 }
139 
140 PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
141 {
142   PetscFunctionBegin;
143   *type = MATSOLVERCUSPARSE;
144   PetscFunctionReturn(0);
145 }
146 
147 /*MC
148   MATSOLVERCUSPARSE = "cusparse" - A matrix solver type providing triangular solvers for sequential matrices
149   on a single GPU of type seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
150   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
151   performance in the triangular solves. Full LU and Cholesky decompositions can be applied through the
152   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
153   algorithms are not recommended. This class does NOT support direct solver operations.
154 
155   Level: beginner
156 
157 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
158 M*/
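
/* Example usage (a sketch, error checking abbreviated): select this solver either from the command line,

     -mat_type aijcusparse -pc_type ilu -pc_factor_mat_solver_type cusparse

   or programmatically for a MATSEQAIJCUSPARSE matrix A (isrow, iscol and info set up as for any other ILU),

     Mat F;
     ierr = MatGetFactor(A,MATSOLVERCUSPARSE,MAT_FACTOR_ILU,&F);CHKERRQ(ierr);
     ierr = MatILUFactorSymbolic(F,A,isrow,iscol,&info);CHKERRQ(ierr);
     ierr = MatLUFactorNumeric(F,A,&info);CHKERRQ(ierr);
*/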
159 
160 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
161 {
162   PetscErrorCode ierr;
163   PetscInt       n = A->rmap->n;
164 
165   PetscFunctionBegin;
166   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
167   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
168   (*B)->factortype = ftype;
169   (*B)->useordering = PETSC_TRUE;
170   ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
171 
172   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
173     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
174     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
175     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
176   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
177     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
178     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
179   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
180 
181   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
182   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
183   PetscFunctionReturn(0);
184 }
185 
186 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
187 {
188   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
189 
190   PetscFunctionBegin;
191   switch (op) {
192   case MAT_CUSPARSE_MULT:
193     cusparsestruct->format = format;
194     break;
195   case MAT_CUSPARSE_ALL:
196     cusparsestruct->format = format;
197     break;
198   default:
199     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
200   }
201   PetscFunctionReturn(0);
202 }
203 
204 /*@
205    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
206    operation. Only the MatMult operation can use different GPU storage formats
207    for MPIAIJCUSPARSE matrices.
208    Not Collective
209 
210    Input Parameters:
211 +  A - Matrix of type SEQAIJCUSPARSE
212 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
213 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
214 
217    Level: intermediate
218 
219 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
220 @*/
221 PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
222 {
223   PetscErrorCode ierr;
224 
225   PetscFunctionBegin;
226   PetscValidHeaderSpecific(A, MAT_CLASSID,1);
227   ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
228   PetscFunctionReturn(0);
229 }
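
/* Example usage (a sketch, error checking abbreviated): keep the SpMV data in ELL format while the
   rest of the matrix stays in CSR,

     ierr = MatSetType(A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
     ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,MAT_CUSPARSE_ELL);CHKERRQ(ierr);

   or equivalently, for the multiply only, pass -mat_cusparse_mult_storage_format ell on the command
   line (parsed in MatSetFromOptions_SeqAIJCUSPARSE() below). */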
230 
231 PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
232 {
233   PetscErrorCode ierr;
234 
235   PetscFunctionBegin;
236   switch (op) {
237     case MAT_FORM_EXPLICIT_TRANSPOSE:
238       /* need to destroy the transpose matrix if present to prevent logic errors if flg is set to true later */
239       if (A->form_explicit_transpose && !flg) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);}
240       A->form_explicit_transpose = flg;
241       break;
242     default:
243       ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
244       break;
245   }
246   PetscFunctionReturn(0);
247 }
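
/* Example usage (a sketch): request an explicit transpose on the GPU so that repeated transpose
   operations reuse it rather than rebuild it,

     ierr = MatSetOption(A,MAT_FORM_EXPLICIT_TRANSPOSE,PETSC_TRUE);CHKERRQ(ierr);

   switching the option back to PETSC_FALSE destroys any transpose already generated (handled above). */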
248 
249 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
250 
251 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
252 {
253   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
254   IS             isrow = b->row,iscol = b->col;
255   PetscBool      row_identity,col_identity;
256   PetscErrorCode ierr;
257 
258   PetscFunctionBegin;
259   ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
260   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
261   B->offloadmask = PETSC_OFFLOAD_CPU;
262   /* determine which version of MatSolve needs to be used. */
263   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
264   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
265   if (row_identity && col_identity) {
266     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
267     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
268     B->ops->matsolve = NULL;
269     B->ops->matsolvetranspose = NULL;
270   } else {
271     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
272     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
273     B->ops->matsolve = NULL;
274     B->ops->matsolvetranspose = NULL;
275   }
276 
277   /* get the triangular factors */
278   ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
279   PetscFunctionReturn(0);
280 }
281 
282 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
283 {
284   PetscErrorCode           ierr;
285   MatCUSPARSEStorageFormat format;
286   PetscBool                flg;
287   Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
288 
289   PetscFunctionBegin;
290   ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
291   if (A->factortype == MAT_FACTOR_NONE) {
292     ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
293                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
294     if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}
295 
296     ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
297                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
298     if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
299    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
300     ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
301                             "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
302     /* If the user set this option, check its consistency with cuSPARSE, since PetscOptionsEnum() assigns enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
303     if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
304 
305     ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
306                             "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
307     if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
308 
309     ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
310                             "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
311     if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
312    #endif
313   }
314   ierr = PetscOptionsTail();CHKERRQ(ierr);
315   PetscFunctionReturn(0);
316 }
317 
318 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
319 {
320   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
321   PetscErrorCode               ierr;
322 
323   PetscFunctionBegin;
324   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
325   ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
326   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
327   PetscFunctionReturn(0);
328 }
329 
330 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
331 {
332   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
333   PetscErrorCode               ierr;
334 
335   PetscFunctionBegin;
336   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
337   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
338   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
339   PetscFunctionReturn(0);
340 }
341 
342 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
343 {
344   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
345   PetscErrorCode               ierr;
346 
347   PetscFunctionBegin;
348   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
349   ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
350   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
351   PetscFunctionReturn(0);
352 }
353 
354 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
355 {
356   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
357   PetscErrorCode               ierr;
358 
359   PetscFunctionBegin;
360   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
361   ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
362   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
363   PetscFunctionReturn(0);
364 }
365 
366 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
367 {
368   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
369   PetscInt                          n = A->rmap->n;
370   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
371   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
372   cusparseStatus_t                  stat;
373   const PetscInt                    *ai = a->i,*aj = a->j,*vi;
374   const MatScalar                   *aa = a->a,*v;
375   PetscInt                          *AiLo, *AjLo;
376   PetscInt                          i,nz, nzLower, offset, rowOffset;
377   PetscErrorCode                    ierr;
378   cudaError_t                       cerr;
379 
380   PetscFunctionBegin;
381   if (!n) PetscFunctionReturn(0);
382   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
383     try {
384       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
385       nzLower=n+ai[n]-ai[1];
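      /* row 0 contributes just its unit diagonal; rows 1..n-1 each contribute ai[i+1]-ai[i] strictly
         lower entries plus a unit diagonal (see the fill loop below), hence n + ai[n] - ai[1] */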
386       if (!loTriFactor) {
387         PetscScalar                       *AALo;
388 
389         cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
390 
391         /* Allocate Space for the lower triangular matrix */
392         cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
393         cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
394 
395         /* Fill the lower triangular matrix */
396         AiLo[0]  = (PetscInt) 0;
397         AiLo[n]  = nzLower;
398         AjLo[0]  = (PetscInt) 0;
399         AALo[0]  = (MatScalar) 1.0;
400         v        = aa;
401         vi       = aj;
402         offset   = 1;
403         rowOffset= 1;
404         for (i=1; i<n; i++) {
405           nz = ai[i+1] - ai[i];
406           /* additional 1 for the term on the diagonal */
407           AiLo[i]    = rowOffset;
408           rowOffset += nz+1;
409 
410           ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
411           ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);
412 
413           offset      += nz;
414           AjLo[offset] = (PetscInt) i;
415           AALo[offset] = (MatScalar) 1.0;
416           offset      += 1;
417 
418           v  += nz;
419           vi += nz;
420         }
421 
422         /* allocate space for the triangular factor information */
423         ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
424         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
425         /* Create the matrix description */
426         stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
427         stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
428        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
429         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
430        #else
431         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
432        #endif
433         stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
434         stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
435 
436         /* set the operation */
437         loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
438 
439         /* set the matrix */
440         loTriFactor->csrMat = new CsrMatrix;
441         loTriFactor->csrMat->num_rows = n;
442         loTriFactor->csrMat->num_cols = n;
443         loTriFactor->csrMat->num_entries = nzLower;
444 
445         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
446         loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);
447 
448         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
449         loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);
450 
451         loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
452         loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
453 
454         /* Create the solve analysis information */
455         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
456         stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
457       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
458         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
459                                        loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
460                                        loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
461                                        loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
462                                        &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
463         cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
464       #endif
465 
466         /* perform the solve analysis */
467         stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
468                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
469                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
470                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
471                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
472                                  ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
473                                #endif
474 );CHKERRCUSPARSE(stat);
475         cerr = WaitForCUDA();CHKERRCUDA(cerr);
476         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
477 
478         /* assign the pointer */
479         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
480         loTriFactor->AA_h = AALo;
481         cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
482         cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
483         ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
484       } else { /* update values only */
485         if (!loTriFactor->AA_h) {
486           cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
487         }
488         /* Fill the lower triangular matrix */
489         loTriFactor->AA_h[0]  = 1.0;
490         v        = aa;
491         vi       = aj;
492         offset   = 1;
493         for (i=1; i<n; i++) {
494           nz = ai[i+1] - ai[i];
495           ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
496           offset      += nz;
497           loTriFactor->AA_h[offset] = 1.0;
498           offset      += 1;
499           v  += nz;
500         }
501         loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
502         ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
503       }
504     } catch(char *ex) {
505       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
506     }
507   }
508   PetscFunctionReturn(0);
509 }
510 
511 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
512 {
513   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
514   PetscInt                          n = A->rmap->n;
515   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
516   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
517   cusparseStatus_t                  stat;
518   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
519   const MatScalar                   *aa = a->a,*v;
520   PetscInt                          *AiUp, *AjUp;
521   PetscInt                          i,nz, nzUpper, offset;
522   PetscErrorCode                    ierr;
523   cudaError_t                       cerr;
524 
525   PetscFunctionBegin;
526   if (!n) PetscFunctionReturn(0);
527   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
528     try {
529       /* next, figure out the number of nonzeros in the upper triangular matrix. */
530       nzUpper = adiag[0]-adiag[n];
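      /* row i of U occupies aa[adiag[i+1]+1 .. adiag[i]] with the diagonal stored last (see the loop
         below), so the per-row counts adiag[i]-adiag[i+1] telescope to adiag[0]-adiag[n] */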
531       if (!upTriFactor) {
532         PetscScalar *AAUp;
533 
534         cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
535 
536         /* Allocate Space for the upper triangular matrix */
537         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
538         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
539 
540         /* Fill the upper triangular matrix */
541         AiUp[0]=(PetscInt) 0;
542         AiUp[n]=nzUpper;
543         offset = nzUpper;
544         for (i=n-1; i>=0; i--) {
545           v  = aa + adiag[i+1] + 1;
546           vi = aj + adiag[i+1] + 1;
547 
548           /* number of elements NOT on the diagonal */
549           nz = adiag[i] - adiag[i+1]-1;
550 
551           /* decrement the offset */
552           offset -= (nz+1);
553 
554           /* first, set the diagonal elements */
555           AjUp[offset] = (PetscInt) i;
556           AAUp[offset] = (MatScalar)1./v[nz];
557           AiUp[i]      = AiUp[i+1] - (nz+1);
558 
559           ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
560           ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
561         }
562 
563         /* allocate space for the triangular factor information */
564         ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
565         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
566 
567         /* Create the matrix description */
568         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
569         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
570        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
571         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
572        #else
573         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
574        #endif
575         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
576         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
577 
578         /* set the operation */
579         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
580 
581         /* set the matrix */
582         upTriFactor->csrMat = new CsrMatrix;
583         upTriFactor->csrMat->num_rows = n;
584         upTriFactor->csrMat->num_cols = n;
585         upTriFactor->csrMat->num_entries = nzUpper;
586 
587         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
588         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
589 
590         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
591         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
592 
593         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
594         upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
595 
596         /* Create the solve analysis information */
597         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
598         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
599       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
600         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
601                                      upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
602                                      upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
603                                      upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
604                                      &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
605         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
606       #endif
607 
608         /* perform the solve analysis */
609         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
610                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
611                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
612                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
613                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
614                                  ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
615                                #endif
616 );CHKERRCUSPARSE(stat);
617         cerr = WaitForCUDA();CHKERRCUDA(cerr);
618         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
619 
620         /* assign the pointer */
621         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
622         upTriFactor->AA_h = AAUp;
623         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
624         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
625         ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
626       } else {
627         if (!upTriFactor->AA_h) {
628           cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
629         }
630         /* Fill the upper triangular matrix */
631         offset = nzUpper;
632         for (i=n-1; i>=0; i--) {
633           v  = aa + adiag[i+1] + 1;
634 
635           /* number of elements NOT on the diagonal */
636           nz = adiag[i] - adiag[i+1]-1;
637 
638           /* decrement the offset */
639           offset -= (nz+1);
640 
641           /* first, set the diagonal elements */
642           upTriFactor->AA_h[offset] = 1./v[nz];
643           ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
644         }
645         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
646         ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
647       }
648     } catch(char *ex) {
649       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
650     }
651   }
652   PetscFunctionReturn(0);
653 }
654 
655 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
656 {
657   PetscErrorCode               ierr;
658   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
659   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
660   IS                           isrow = a->row,iscol = a->icol;
661   PetscBool                    row_identity,col_identity;
662   PetscInt                     n = A->rmap->n;
663 
664   PetscFunctionBegin;
665   if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
666   ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
667   ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);
668 
669   if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
670   cusparseTriFactors->nnz=a->nz;
671 
672   A->offloadmask = PETSC_OFFLOAD_BOTH;
673   /* lower triangular indices */
674   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
675   if (!row_identity && !cusparseTriFactors->rpermIndices) {
676     const PetscInt *r;
677 
678     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
679     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
680     cusparseTriFactors->rpermIndices->assign(r, r+n);
681     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
682     ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
683   }
684 
685   /* upper triangular indices */
686   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
687   if (!col_identity && !cusparseTriFactors->cpermIndices) {
688     const PetscInt *c;
689 
690     ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
691     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
692     cusparseTriFactors->cpermIndices->assign(c, c+n);
693     ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
694     ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
695   }
696   PetscFunctionReturn(0);
697 }
698 
699 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
700 {
701   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
702   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
703   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
704   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
705   cusparseStatus_t                  stat;
706   PetscErrorCode                    ierr;
707   cudaError_t                       cerr;
708   PetscInt                          *AiUp, *AjUp;
709   PetscScalar                       *AAUp;
710   PetscScalar                       *AALo;
711   PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
712   Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
713   const PetscInt                    *ai = b->i,*aj = b->j,*vj;
714   const MatScalar                   *aa = b->a,*v;
715 
716   PetscFunctionBegin;
717   if (!n) PetscFunctionReturn(0);
718   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
719     try {
720       cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
721       cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
722       if (!upTriFactor && !loTriFactor) {
723         /* Allocate Space for the upper triangular matrix */
724         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
725         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
726 
727         /* Fill the upper triangular matrix */
728         AiUp[0]=(PetscInt) 0;
729         AiUp[n]=nzUpper;
730         offset = 0;
731         for (i=0; i<n; i++) {
732           /* set the pointers */
733           v  = aa + ai[i];
734           vj = aj + ai[i];
735           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
736 
737           /* first, set the diagonal elements */
738           AjUp[offset] = (PetscInt) i;
739           AAUp[offset] = (MatScalar)1.0/v[nz];
740           AiUp[i]      = offset;
741           AALo[offset] = (MatScalar)1.0/v[nz];
742 
743           offset+=1;
744           if (nz>0) {
745             ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
746             ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
747             for (j=offset; j<offset+nz; j++) {
748               AAUp[j] = -AAUp[j];
749               AALo[j] = AAUp[j]/v[nz];
750             }
751             offset+=nz;
752           }
753         }
754 
755         /* allocate space for the triangular factor information */
756         ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
757         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
758 
759         /* Create the matrix description */
760         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
761         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
762        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
763         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
764        #else
765         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
766        #endif
767         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
768         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
769 
770         /* set the matrix */
771         upTriFactor->csrMat = new CsrMatrix;
772         upTriFactor->csrMat->num_rows = A->rmap->n;
773         upTriFactor->csrMat->num_cols = A->cmap->n;
774         upTriFactor->csrMat->num_entries = a->nz;
775 
776         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
777         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
778 
779         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
780         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
781 
782         upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
783         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
784 
785         /* set the operation */
786         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
787 
788         /* Create the solve analysis information */
789         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
790         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
791       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
792         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
793                                        upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
794                                        upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
795                                        upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
796                                        &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
797         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
798       #endif
799 
800         /* perform the solve analysis */
801         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
802                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
803                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
804                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
805                                 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
806                                  ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
807                                 #endif
808 );CHKERRCUSPARSE(stat);
809         cerr = WaitForCUDA();CHKERRCUDA(cerr);
810         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
811 
812         /* assign the pointer */
813         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
814 
815         /* allocate space for the triangular factor information */
816         ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
817         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
818 
819         /* Create the matrix description */
820         stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
821         stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
822        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
823         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
824        #else
825         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
826        #endif
827         stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
828         stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
829 
830         /* set the operation */
831         loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
832 
833         /* set the matrix */
834         loTriFactor->csrMat = new CsrMatrix;
835         loTriFactor->csrMat->num_rows = A->rmap->n;
836         loTriFactor->csrMat->num_cols = A->cmap->n;
837         loTriFactor->csrMat->num_entries = a->nz;
838 
839         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
840         loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
841 
842         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
843         loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
844 
845         loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
846         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
847 
848         /* Create the solve analysis information */
849         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
850         stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
851       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
852         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
853                                        loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
854                                        loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
855                                        loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
856                                        &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
857         cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
858       #endif
859 
860         /* perform the solve analysis */
861         stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
862                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
863                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
864                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
865                                 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
866                                  ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
867                                 #endif
868 );CHKERRCUSPARSE(stat);
869         cerr = WaitForCUDA();CHKERRCUDA(cerr);
870         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
871 
872         /* assign the pointer */
873         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
874 
875         ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
876         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
877         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
878       } else {
879         /* Fill the upper triangular matrix */
880         offset = 0;
881         for (i=0; i<n; i++) {
882           /* set the pointers */
883           v  = aa + ai[i];
884           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
885 
886           /* first, set the diagonal elements */
887           AAUp[offset] = 1.0/v[nz];
888           AALo[offset] = 1.0/v[nz];
889 
890           offset+=1;
891           if (nz>0) {
892             ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
893             for (j=offset; j<offset+nz; j++) {
894               AAUp[j] = -AAUp[j];
895               AALo[j] = AAUp[j]/v[nz];
896             }
897             offset+=nz;
898           }
899         }
900         if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing upTriFactor");
901         if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing loTriFactor");
902         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
903         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
904         ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
905       }
906       cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
907       cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
908     } catch(char *ex) {
909       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
910     }
911   }
912   PetscFunctionReturn(0);
913 }
914 
915 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
916 {
917   PetscErrorCode               ierr;
918   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
919   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
920   IS                           ip = a->row;
921   PetscBool                    perm_identity;
922   PetscInt                     n = A->rmap->n;
923 
924   PetscFunctionBegin;
925   if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
926   ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
927   if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
928   cusparseTriFactors->nnz=(a->nz-n)*2 + n;
929 
930   A->offloadmask = PETSC_OFFLOAD_BOTH;
931 
932   /* lower triangular indices */
933   ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
934   if (!perm_identity) {
935     IS             iip;
936     const PetscInt *irip,*rip;
937 
938     ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
939     ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
940     ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
941     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
942     cusparseTriFactors->rpermIndices->assign(rip, rip+n);
943     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
944     cusparseTriFactors->cpermIndices->assign(irip, irip+n);
945     ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
946     ierr = ISDestroy(&iip);CHKERRQ(ierr);
947     ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
948     ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
949   }
950   PetscFunctionReturn(0);
951 }
952 
953 #define CHECK_LAUNCH_ERROR()                                                             \
954 do {                                                                                     \
955   /* Check synchronous errors, i.e. pre-launch */                                        \
956   cudaError_t err = cudaGetLastError();                                                  \
957   if (cudaSuccess != err) {                                                              \
958     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cuda error: %s",cudaGetErrorString(err)); \
959   }                                                                                      \
960   /* Check asynchronous errors, i.e. kernel failed (ULF) */                              \
961   err = cudaDeviceSynchronize();                                                         \
962   if (cudaSuccess != err) {                                                              \
963     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cuda error: %s",cudaGetErrorString(err)); \
964   }                                                                                      \
965  } while (0)
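
/* Intended usage (a sketch with a hypothetical kernel): place the macro immediately after a launch,

     MyKernel<<<nblocks,nthreads>>>(args);
     CHECK_LAUNCH_ERROR();

   cudaGetLastError() reports launch-configuration problems while cudaDeviceSynchronize() surfaces
   failures that only appear once the kernel has actually executed. */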
966 
967 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
968 {
969   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
970   IS             ip = b->row;
971   PetscBool      perm_identity;
972   PetscErrorCode ierr;
973 
974   PetscFunctionBegin;
975   ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
976   ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
977   B->offloadmask = PETSC_OFFLOAD_CPU;
978   /* determine which version of MatSolve needs to be used. */
979   ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
980   if (perm_identity) {
981     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
982     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
983     B->ops->matsolve = NULL;
984     B->ops->matsolvetranspose = NULL;
985   } else {
986     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
987     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
988     B->ops->matsolve = NULL;
989     B->ops->matsolvetranspose = NULL;
990   }
991 
992   /* get the triangular factors */
993   ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
994   PetscFunctionReturn(0);
995 }
996 
997 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
998 {
999   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1000   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1001   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1002   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
1003   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1004   cusparseStatus_t                  stat;
1005   cusparseIndexBase_t               indexBase;
1006   cusparseMatrixType_t              matrixType;
1007   cusparseFillMode_t                fillMode;
1008   cusparseDiagType_t                diagType;
1009   cudaError_t                       cerr;
1010   PetscErrorCode                    ierr;
1011 
1012   PetscFunctionBegin;
1013   /* allocate space for the transpose of the lower triangular factor */
1014   ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
1015   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1016 
1017   /* set the matrix descriptors of the lower triangular factor */
1018   matrixType = cusparseGetMatType(loTriFactor->descr);
1019   indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
1020   fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1021     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1022   diagType = cusparseGetMatDiagType(loTriFactor->descr);
1023 
1024   /* Create the matrix description */
1025   stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
1026   stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
1027   stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
1028   stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
1029   stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
1030 
1031   /* set the operation */
1032   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1033 
1034   /* allocate GPU space for the CSC of the lower triangular factor*/
1035   loTriFactorT->csrMat = new CsrMatrix;
1036   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
1037   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
1038   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
1039   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
1040   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1041   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
1042 
1043   /* compute the transpose of the lower triangular factor, i.e. the CSC */
1044 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1045   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1046                                        loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1047                                        loTriFactor->csrMat->values->data().get(),
1048                                        loTriFactor->csrMat->row_offsets->data().get(),
1049                                        loTriFactor->csrMat->column_indices->data().get(),
1050                                        loTriFactorT->csrMat->values->data().get(),
1051                                        loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1052                                        CUSPARSE_ACTION_NUMERIC,indexBase,
1053                                        CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1054   cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1055 #endif
1056 
1057   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1058   stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1059                           loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1060                           loTriFactor->csrMat->values->data().get(),
1061                           loTriFactor->csrMat->row_offsets->data().get(),
1062                           loTriFactor->csrMat->column_indices->data().get(),
1063                           loTriFactorT->csrMat->values->data().get(),
1064                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1065                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1066                           CUSPARSE_ACTION_NUMERIC, indexBase,
1067                           CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer
1068                         #else
1069                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1070                           CUSPARSE_ACTION_NUMERIC, indexBase
1071                         #endif
1072 );CHKERRCUSPARSE(stat);
1073   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1074   ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1075 
1076   /* Create the solve analysis information */
1077   ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1078   stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1079 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1080   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
1081                                 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1082                                 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1083                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
1084                                 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1085   cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1086 #endif
1087 
1088   /* perform the solve analysis */
1089   stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
1090                            loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1091                            loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1092                            loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo
1093                           #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1094                            ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
1095                           #endif
1096 );CHKERRCUSPARSE(stat);
1097   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1098   ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1099 
1100   /* assign the pointer */
1101   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1102 
1103   /*********************************************/
1104   /* Now the Transpose of the Upper Tri Factor */
1105   /*********************************************/
1106 
1107   /* allocate space for the transpose of the upper triangular factor */
1108   ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
1109   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1110 
1111   /* set the matrix descriptors of the upper triangular factor */
1112   matrixType = cusparseGetMatType(upTriFactor->descr);
1113   indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
1114   fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1115     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1116   diagType = cusparseGetMatDiagType(upTriFactor->descr);
1117 
1118   /* Create the matrix descriptor */
1119   stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
1120   stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
1121   stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
1122   stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
1123   stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
1124 
1125   /* set the operation */
1126   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1127 
1128   /* allocate GPU space for the CSC of the upper triangular factor */
1129   upTriFactorT->csrMat = new CsrMatrix;
1130   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1131   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1132   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1133   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
1134   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1135   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1136 
1137   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1138 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1139   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
1140                                 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1141                                 upTriFactor->csrMat->values->data().get(),
1142                                 upTriFactor->csrMat->row_offsets->data().get(),
1143                                 upTriFactor->csrMat->column_indices->data().get(),
1144                                 upTriFactorT->csrMat->values->data().get(),
1145                                 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1146                                 CUSPARSE_ACTION_NUMERIC,indexBase,
1147                                 CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1148   cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1149 #endif
1150 
1151   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1152   stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
1153                           upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1154                           upTriFactor->csrMat->values->data().get(),
1155                           upTriFactor->csrMat->row_offsets->data().get(),
1156                           upTriFactor->csrMat->column_indices->data().get(),
1157                           upTriFactorT->csrMat->values->data().get(),
1158                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1159                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1160                           CUSPARSE_ACTION_NUMERIC, indexBase,
1161                           CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer
1162                         #else
1163                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1164                           CUSPARSE_ACTION_NUMERIC, indexBase
1165                         #endif
1166 );CHKERRCUSPARSE(stat);
1167   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1168   ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1169 
1170   /* Create the solve analysis information */
1171   ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1172   stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1173   #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1174   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
1175                                  upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1176                                  upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1177                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
1178                                  &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1179   cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1180   #endif
1181 
1182   /* perform the solve analysis */
1183   stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
1184                            upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1185                            upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1186                            upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo
1187                           #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1188                            ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
1189                           #endif
1190 );CHKERRCUSPARSE(stat);
1191   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1192   ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1193 
1194   /* assign the pointer */
1195   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1196   PetscFunctionReturn(0);
1197 }
1198 
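/* Functor that maps a PetscScalar to a PetscInt by taking its real part; used below to turn the
   values produced by a csr2csc pass over the sequence 0,1,2,... back into integer permutation indices */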
1199 struct PetscScalarToPetscInt
1200 {
1201   __host__ __device__
1202   PetscInt operator()(PetscScalar s)
1203   {
1204     return (PetscInt)PetscRealPart(s);
1205   }
1206 };
1207 
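/* Build (or update) an explicit transpose of the CSR matrix on the GPU so that operations needing
   op(A) = A^T (e.g. MATPRODUCT_AtB below) can use a non-transpose cusparse kernel. For the CSR format
   the csr2csc permutation is cached in csr2csc_i, so later numerical updates only need to gather the
   values into the transposed storage. */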
1208 static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTransposeForMult(Mat A)
1209 {
1210   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1211   Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
1212   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1213   cusparseStatus_t             stat;
1214   cusparseIndexBase_t          indexBase;
1215   cudaError_t                  err;
1216   PetscErrorCode               ierr;
1217 
1218   PetscFunctionBegin;
1219   if (!A->form_explicit_transpose || !A->rmap->n || !A->cmap->n) PetscFunctionReturn(0);
1220   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
1221   matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1222   if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing mat struct");
1223   matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1224   if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing matTranspose struct");
1225   if (A->transupdated) PetscFunctionReturn(0);
1226   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1227   if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
1228     ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
1229   }
1230   if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1231     matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
1232     stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
1233     indexBase = cusparseGetMatIndexBase(matstruct->descr);
1234     stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
1235     stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
1236 
1237     /* set alpha and beta */
1238     err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
1239     err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1240     err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1241     err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1242     err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1243     err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1244 
1245     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1246       CsrMatrix *matrixT = new CsrMatrix;
1247       matstructT->mat = matrixT;
1248       matrixT->num_rows = A->cmap->n;
1249       matrixT->num_cols = A->rmap->n;
1250       matrixT->num_entries = a->nz;
1251       matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
1252       matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1253       matrixT->values = new THRUSTARRAY(a->nz);
1254 
1255       if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
1256       cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
1257 
1258      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1259       stat = cusparseCreateCsr(&matstructT->matDescr,
1260                                matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
1261                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
1262                                matrixT->values->data().get(),
1263                                CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1264                                indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
1265      #endif
1266     } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1267    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1268       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1269    #else
1270       CsrMatrix *temp  = new CsrMatrix;
1271       CsrMatrix *tempT = new CsrMatrix;
1272       /* First convert HYB to CSR */
1273       temp->num_rows = A->rmap->n;
1274       temp->num_cols = A->cmap->n;
1275       temp->num_entries = a->nz;
1276       temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1277       temp->column_indices = new THRUSTINTARRAY32(a->nz);
1278       temp->values = new THRUSTARRAY(a->nz);
1279 
1280       stat = cusparse_hyb2csr(cusparsestruct->handle,
1281                               matstruct->descr, (cusparseHybMat_t)matstruct->mat,
1282                               temp->values->data().get(),
1283                               temp->row_offsets->data().get(),
1284                               temp->column_indices->data().get());CHKERRCUSPARSE(stat);
1285 
1286       /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1287       tempT->num_rows = A->rmap->n;
1288       tempT->num_cols = A->cmap->n;
1289       tempT->num_entries = a->nz;
1290       tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1291       tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1292       tempT->values = new THRUSTARRAY(a->nz);
1293 
1294       stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
1295                               temp->num_cols, temp->num_entries,
1296                               temp->values->data().get(),
1297                               temp->row_offsets->data().get(),
1298                               temp->column_indices->data().get(),
1299                               tempT->values->data().get(),
1300                               tempT->column_indices->data().get(),
1301                               tempT->row_offsets->data().get(),
1302                               CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1303 
1304       /* Last, convert CSC to HYB */
1305       cusparseHybMat_t hybMat;
1306       stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1307       cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1308         CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1309       stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
1310                               matstructT->descr, tempT->values->data().get(),
1311                               tempT->row_offsets->data().get(),
1312                               tempT->column_indices->data().get(),
1313                               hybMat, 0, partition);CHKERRCUSPARSE(stat);
1314 
1315       /* assign the pointer */
1316       matstructT->mat = hybMat;
1317       A->transupdated = PETSC_TRUE;
1318       /* delete temporaries */
1319       if (tempT) {
1320         if (tempT->values) delete (THRUSTARRAY*) tempT->values;
1321         if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
1322         if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
1323         delete (CsrMatrix*) tempT;
1324       }
1325       if (temp) {
1326         if (temp->values) delete (THRUSTARRAY*) temp->values;
1327         if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
1328         if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
1329         delete (CsrMatrix*) temp;
1330       }
1331      #endif
1332     }
1333   }
1334   if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1335     CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
1336     CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
1337     if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix");
1338     if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix rows");
1339     if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix cols");
1340     if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix values");
1341     if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT");
1342     if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT rows");
1343     if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT cols");
1344     if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT values");
1345     if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1346       cusparsestruct->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
1347       cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
1348       ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
1349     }
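    /* The value permutation applied by csr2csc is recovered once by converting the sequence
       0,1,2,... and reading back the reordered entries (via PetscScalarToPetscInt); afterwards a
       transpose update is just a gather with this index array (see the thrust::copy below) */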
1350     if (!cusparsestruct->csr2csc_i) {
1351       THRUSTARRAY csr2csc_a(matrix->num_entries);
1352       PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));
1353 
1354       indexBase = cusparseGetMatIndexBase(matstruct->descr);
1355      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1356       void   *csr2cscBuffer;
1357       size_t csr2cscBufferSize;
1358       stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
1359                                            A->cmap->n, matrix->num_entries,
1360                                            matrix->values->data().get(),
1361                                            cusparsestruct->rowoffsets_gpu->data().get(),
1362                                            matrix->column_indices->data().get(),
1363                                            matrixT->values->data().get(),
1364                                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1365                                            CUSPARSE_ACTION_NUMERIC,indexBase,
1366                                            cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat);
1367       err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err);
1368      #endif
1369 
1370       if (matrix->num_entries) {
1371         /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
1372            mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11, while CUDA-10 is fine.
1373            I checked every parameter and they were all correct; I have no clue why cusparse complains.
1374 
1375            Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
1376            should be filled with indexBase. So I just take a shortcut here.
1377         */
1378         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1379                               A->cmap->n,matrix->num_entries,
1380                               csr2csc_a.data().get(),
1381                               cusparsestruct->rowoffsets_gpu->data().get(),
1382                               matrix->column_indices->data().get(),
1383                               matrixT->values->data().get(),
1384                              #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1385                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1386                               CUSPARSE_ACTION_NUMERIC,indexBase,
1387                               cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat);
1388                              #else
1389                               matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1390                               CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1391                              #endif
1392       } else {
1393         matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
1394       }
1395 
1396       cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1397       PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
1398      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1399       err = cudaFree(csr2cscBuffer);CHKERRCUDA(err);
1400      #endif
1401     }
1402     PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
1403                                                      thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
1404                                                      matrixT->values->begin()));
1405   }
1406   ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1407   /* the compressed row indices are not used for matTranspose */
1408   matstructT->cprowIndices = NULL;
1409   /* assign the pointer */
1410   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
1411   A->transupdated = PETSC_TRUE;
1412   PetscFunctionReturn(0);
1413 }
1414 
1415 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
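/* (Presumably because a cusparse triangular solve with op(A) = transpose would still need its own
   analysis pass and buffers, the transposed factors are generated and analyzed once here and reused.) */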
1416 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1417 {
1418   PetscInt                              n = xx->map->n;
1419   const PetscScalar                     *barray;
1420   PetscScalar                           *xarray;
1421   thrust::device_ptr<const PetscScalar> bGPU;
1422   thrust::device_ptr<PetscScalar>       xGPU;
1423   cusparseStatus_t                      stat;
1424   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1425   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1426   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1427   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1428   PetscErrorCode                        ierr;
1429   cudaError_t                           cerr;
1430 
1431   PetscFunctionBegin;
1432   /* Analyze the matrix and create the transpose ... on the fly */
1433   if (!loTriFactorT && !upTriFactorT) {
1434     ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
1435     loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1436     upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1437   }
1438 
1439   /* Get the GPU pointers */
1440   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1441   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1442   xGPU = thrust::device_pointer_cast(xarray);
1443   bGPU = thrust::device_pointer_cast(barray);
1444 
1445   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1446   /* First, reorder with the row permutation */
1447   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1448                thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
1449                xGPU);
1450 
1451   /* Next, solve U */
1452   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1453                         upTriFactorT->csrMat->num_rows,
1454                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1455                         upTriFactorT->csrMat->num_entries,
1456                       #endif
1457                         &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1458                         upTriFactorT->csrMat->values->data().get(),
1459                         upTriFactorT->csrMat->row_offsets->data().get(),
1460                         upTriFactorT->csrMat->column_indices->data().get(),
1461                         upTriFactorT->solveInfo,
1462                         xarray, tempGPU->data().get()
1463                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1464                         ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
1465                       #endif
1466 );CHKERRCUSPARSE(stat);
1467 
1468   /* Then, solve L */
1469   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1470                         loTriFactorT->csrMat->num_rows,
1471                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1472                         loTriFactorT->csrMat->num_entries,
1473                       #endif
1474                         &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1475                         loTriFactorT->csrMat->values->data().get(),
1476                         loTriFactorT->csrMat->row_offsets->data().get(),
1477                         loTriFactorT->csrMat->column_indices->data().get(),
1478                         loTriFactorT->solveInfo,
1479                         tempGPU->data().get(), xarray
1480                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1481                         ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
1482                       #endif
1483 );CHKERRCUSPARSE(stat);
1484 
1485   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1486   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1487                thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
1488                tempGPU->begin());
1489 
1490   /* Copy the temporary to the full solution. */
1491   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);
1492 
1493   /* restore */
1494   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1495   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
1496   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1497   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1498   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1499   PetscFunctionReturn(0);
1500 }
1501 
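/* Transpose solve in the natural ordering: no row/column permutations are needed, so we solve with
   the transposed factors directly, U^T first and then L^T, using a work vector in between */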
1502 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1503 {
1504   const PetscScalar                 *barray;
1505   PetscScalar                       *xarray;
1506   cusparseStatus_t                  stat;
1507   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1508   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1509   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1510   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1511   PetscErrorCode                    ierr;
1512   cudaError_t                       cerr;
1513 
1514   PetscFunctionBegin;
1515   /* Analyze the matrix and create the transpose ... on the fly */
1516   if (!loTriFactorT && !upTriFactorT) {
1517     ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
1518     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1519     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1520   }
1521 
1522   /* Get the GPU pointers */
1523   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1524   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1525 
1526   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1527   /* First, solve U */
1528   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1529                         upTriFactorT->csrMat->num_rows,
1530                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1531                         upTriFactorT->csrMat->num_entries,
1532                       #endif
1533                         &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1534                         upTriFactorT->csrMat->values->data().get(),
1535                         upTriFactorT->csrMat->row_offsets->data().get(),
1536                         upTriFactorT->csrMat->column_indices->data().get(),
1537                         upTriFactorT->solveInfo,
1538                         barray, tempGPU->data().get()
1539                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1540                         ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
1541                       #endif
1542 );CHKERRCUSPARSE(stat);
1543 
1544   /* Then, solve L */
1545   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1546                         loTriFactorT->csrMat->num_rows,
1547                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1548                         loTriFactorT->csrMat->num_entries,
1549                       #endif
1550                         &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1551                         loTriFactorT->csrMat->values->data().get(),
1552                         loTriFactorT->csrMat->row_offsets->data().get(),
1553                         loTriFactorT->csrMat->column_indices->data().get(),
1554                         loTriFactorT->solveInfo,
1555                         tempGPU->data().get(), xarray
1556                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1557                         ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
1558                       #endif
1559 );CHKERRCUSPARSE(stat);
1560 
1561   /* restore */
1562   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1563   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
1564   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1565   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1566   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1567   PetscFunctionReturn(0);
1568 }
1569 
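/* Solve A x = b with the cached triangular factors: permute b with the row permutation, do the
   lower triangular solve, then the upper triangular solve, and finally scatter the result through
   the column permutation into x */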
1570 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1571 {
1572   const PetscScalar                     *barray;
1573   PetscScalar                           *xarray;
1574   thrust::device_ptr<const PetscScalar> bGPU;
1575   thrust::device_ptr<PetscScalar>       xGPU;
1576   cusparseStatus_t                      stat;
1577   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1578   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1579   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1580   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1581   PetscErrorCode                        ierr;
1582   cudaError_t                           cerr;
1583 
1584   PetscFunctionBegin;
1585 
1586   /* Get the GPU pointers */
1587   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1588   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1589   xGPU = thrust::device_pointer_cast(xarray);
1590   bGPU = thrust::device_pointer_cast(barray);
1591 
1592   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1593   /* First, reorder with the row permutation */
1594   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1595                thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
1596                tempGPU->begin());
1597 
1598   /* Next, solve L */
1599   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1600                         loTriFactor->csrMat->num_rows,
1601                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1602                         loTriFactor->csrMat->num_entries,
1603                       #endif
1604                         &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1605                         loTriFactor->csrMat->values->data().get(),
1606                         loTriFactor->csrMat->row_offsets->data().get(),
1607                         loTriFactor->csrMat->column_indices->data().get(),
1608                         loTriFactor->solveInfo,
1609                         tempGPU->data().get(), xarray
1610                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1611                         ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
1612                       #endif
1613 );CHKERRCUSPARSE(stat);
1614 
1615   /* Then, solve U */
1616   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1617                         upTriFactor->csrMat->num_rows,
1618                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1619                         upTriFactor->csrMat->num_entries,
1620                       #endif
1621                         &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1622                         upTriFactor->csrMat->values->data().get(),
1623                         upTriFactor->csrMat->row_offsets->data().get(),
1624                         upTriFactor->csrMat->column_indices->data().get(),
1625                         upTriFactor->solveInfo,
1626                         xarray, tempGPU->data().get()
1627                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1628                         ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
1629                       #endif
1630 );CHKERRCUSPARSE(stat);
1631 
1632   /* Last, reorder with the column permutation */
1633   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
1634                thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
1635                xGPU);
1636 
1637   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1638   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
1639   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1640   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1641   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1642   PetscFunctionReturn(0);
1643 }
1644 
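/* Natural-ordering variant of the above: no permutations are applied, so b feeds the L solve
   directly and the U solve writes straight into x */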
1645 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1646 {
1647   const PetscScalar                 *barray;
1648   PetscScalar                       *xarray;
1649   cusparseStatus_t                  stat;
1650   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1651   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1652   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1653   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1654   PetscErrorCode                    ierr;
1655   cudaError_t                       cerr;
1656 
1657   PetscFunctionBegin;
1658   /* Get the GPU pointers */
1659   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1660   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1661 
1662   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1663   /* First, solve L */
1664   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1665                         loTriFactor->csrMat->num_rows,
1666                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1667                         loTriFactor->csrMat->num_entries,
1668                       #endif
1669                         &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1670                         loTriFactor->csrMat->values->data().get(),
1671                         loTriFactor->csrMat->row_offsets->data().get(),
1672                         loTriFactor->csrMat->column_indices->data().get(),
1673                         loTriFactor->solveInfo,
1674                         barray, tempGPU->data().get()
1675                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1676                         ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
1677                       #endif
1678 );CHKERRCUSPARSE(stat);
1679 
1680   /* Next, solve U */
1681   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1682                         upTriFactor->csrMat->num_rows,
1683                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1684                         upTriFactor->csrMat->num_entries,
1685                       #endif
1686                         &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1687                         upTriFactor->csrMat->values->data().get(),
1688                         upTriFactor->csrMat->row_offsets->data().get(),
1689                         upTriFactor->csrMat->column_indices->data().get(),
1690                         upTriFactor->solveInfo,
1691                         tempGPU->data().get(), xarray
1692                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1693                         ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
1694                       #endif
1695 );CHKERRCUSPARSE(stat);
1696 
1697   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1698   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
1699   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1700   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1701   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1702   PetscFunctionReturn(0);
1703 }
1704 
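/* Copy only the numerical values of the CSR matrix from the GPU back to the host when the
   up-to-date data lives on the GPU; the sparsity pattern is unchanged, so offloadmask becomes BOTH */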
1705 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
1706 {
1707   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
1708   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
1709   cudaError_t        cerr;
1710   PetscErrorCode     ierr;
1711 
1712   PetscFunctionBegin;
1713   if (A->offloadmask == PETSC_OFFLOAD_GPU) {
1714     CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat;
1715 
1716     ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
1717     cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
1718     cerr = WaitForCUDA();CHKERRCUDA(cerr);
1719     ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr);
1720     ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
1721     A->offloadmask = PETSC_OFFLOAD_BOTH;
1722   }
1723   PetscFunctionReturn(0);
1724 }
1725 
1726 static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
1727 {
1728   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
1729   PetscErrorCode ierr;
1730 
1731   PetscFunctionBegin;
1732   ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
1733   *array = a->a;
1734   A->offloadmask = PETSC_OFFLOAD_CPU;
1735   PetscFunctionReturn(0);
1736 }
1737 
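/* Copy the AIJ matrix to the GPU. If the nonzero pattern is unchanged and the format is CSR, only
   the values are re-uploaded; otherwise the whole cusparse representation (descriptor, row offsets,
   column indices, values and the optional compressed-row index list) is rebuilt from scratch */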
1738 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1739 {
1740   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1741   Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
1742   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1743   PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
1744   PetscErrorCode               ierr;
1745   cusparseStatus_t             stat;
1746   PetscBool                    both = PETSC_TRUE;
1747   cudaError_t                  err;
1748 
1749   PetscFunctionBegin;
1750   if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Cannot copy to GPU");
1751   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
1752     if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
1753       CsrMatrix *matrix;
1754       matrix = (CsrMatrix*)cusparsestruct->mat->mat;
1755 
1756       if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR values");
1757       ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1758       matrix->values->assign(a->a, a->a+a->nz);
1759       err  = WaitForCUDA();CHKERRCUDA(err);
1760       ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
1761       ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1762       ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
1763     } else {
1764       PetscInt nnz;
1765       ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1766       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
1767       ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
1768       delete cusparsestruct->workVector;
1769       delete cusparsestruct->rowoffsets_gpu;
1770       cusparsestruct->workVector = NULL;
1771       cusparsestruct->rowoffsets_gpu = NULL;
1772       try {
1773         if (a->compressedrow.use) {
1774           m    = a->compressedrow.nrows;
1775           ii   = a->compressedrow.i;
1776           ridx = a->compressedrow.rindex;
1777         } else {
1778           m    = A->rmap->n;
1779           ii   = a->i;
1780           ridx = NULL;
1781         }
1782         if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR row data");
1783         if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR column data");
1784         if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
1785         else nnz = a->nz;
1786 
1787         /* create cusparse matrix */
1788         cusparsestruct->nrows = m;
1789         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1790         stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
1791         stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
1792         stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
1793 
1794         err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
1795         err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1796         err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1797         err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1798         err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1799         err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1800         stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
1801 
1802         /* Build the cusparse matrix in the requested storage format; hybrid/ellpack is only supported before CUDA 11 */
1803         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1804           /* set the matrix */
1805           CsrMatrix *mat= new CsrMatrix;
1806           mat->num_rows = m;
1807           mat->num_cols = A->cmap->n;
1808           mat->num_entries = nnz;
1809           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1810           mat->row_offsets->assign(ii, ii + m+1);
1811 
1812           mat->column_indices = new THRUSTINTARRAY32(nnz);
1813           mat->column_indices->assign(a->j, a->j+nnz);
1814 
1815           mat->values = new THRUSTARRAY(nnz);
1816           if (a->a) mat->values->assign(a->a, a->a+nnz);
1817 
1818           /* assign the pointer */
1819           matstruct->mat = mat;
1820          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1821           if (mat->num_rows) { /* cusparse errors on empty matrices! */
1822             stat = cusparseCreateCsr(&matstruct->matDescr,
1823                                     mat->num_rows, mat->num_cols, mat->num_entries,
1824                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
1825                                     mat->values->data().get(),
1826                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
1827                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
1828           }
1829          #endif
1830         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1831          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1832           SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1833          #else
1834           CsrMatrix *mat= new CsrMatrix;
1835           mat->num_rows = m;
1836           mat->num_cols = A->cmap->n;
1837           mat->num_entries = nnz;
1838           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1839           mat->row_offsets->assign(ii, ii + m+1);
1840 
1841           mat->column_indices = new THRUSTINTARRAY32(nnz);
1842           mat->column_indices->assign(a->j, a->j+nnz);
1843 
1844           mat->values = new THRUSTARRAY(nnz);
1845           if (a->a) mat->values->assign(a->a, a->a+nnz);
1846 
1847           cusparseHybMat_t hybMat;
1848           stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1849           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1850             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1851           stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
1852               matstruct->descr, mat->values->data().get(),
1853               mat->row_offsets->data().get(),
1854               mat->column_indices->data().get(),
1855               hybMat, 0, partition);CHKERRCUSPARSE(stat);
1856           /* assign the pointer */
1857           matstruct->mat = hybMat;
1858 
1859           if (mat) {
1860             if (mat->values) delete (THRUSTARRAY*)mat->values;
1861             if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
1862             if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
1863             delete (CsrMatrix*)mat;
1864           }
1865          #endif
1866         }
1867 
1868         /* assign the compressed row indices */
1869         if (a->compressedrow.use) {
1870           cusparsestruct->workVector = new THRUSTARRAY(m);
1871           matstruct->cprowIndices    = new THRUSTINTARRAY(m);
1872           matstruct->cprowIndices->assign(ridx,ridx+m);
1873           tmp = m;
1874         } else {
1875           cusparsestruct->workVector = NULL;
1876           matstruct->cprowIndices    = NULL;
1877           tmp = 0;
1878         }
1879         ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);
1880 
1881         /* assign the pointer */
1882         cusparsestruct->mat = matstruct;
1883       } catch(char *ex) {
1884         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1885       }
1886       err  = WaitForCUDA();CHKERRCUDA(err);
1887       ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1888       cusparsestruct->nonzerostate = A->nonzerostate;
1889     }
1890     if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
1891   }
1892   PetscFunctionReturn(0);
1893 }
1894 
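/* Thrust functors meant to be used with zip-iterator tuples on the GPU: accumulate (y += x),
   assign (y = x), and assign in the reverse direction, respectively */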
1895 struct VecCUDAPlusEquals
1896 {
1897   template <typename Tuple>
1898   __host__ __device__
1899   void operator()(Tuple t)
1900   {
1901     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
1902   }
1903 };
1904 
1905 struct VecCUDAEquals
1906 {
1907   template <typename Tuple>
1908   __host__ __device__
1909   void operator()(Tuple t)
1910   {
1911     thrust::get<1>(t) = thrust::get<0>(t);
1912   }
1913 };
1914 
1915 struct VecCUDAEqualsReverse
1916 {
1917   template <typename Tuple>
1918   __host__ __device__
1919   void operator()(Tuple t)
1920   {
1921     thrust::get<0>(t) = thrust::get<1>(t);
1922   }
1923 };
1924 
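/* Cached state for MatProduct operations with a SeqAIJCUSPARSE matrix: scratch storage for an
   explicitly transposed dense B, an intermediate dense product X (for PtAP/RARt), and, with CUDA 11,
   the cusparse descriptors and work buffers reused across numeric phases */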
1925 struct MatMatCusparse {
1926   PetscBool             cisdense;
1927   PetscScalar           *Bt;
1928   Mat                   X;
1929   PetscBool             reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
1930   PetscLogDouble        flops;
1931   CsrMatrix             *Bcsr;
1932 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1933   cusparseSpMatDescr_t  matSpBDescr;
1934   PetscBool             initialized;   /* whether the descriptors/buffers below for C = alpha op(A) op(B) + beta C have been set up */
1935   cusparseDnMatDescr_t  matBDescr;
1936   cusparseDnMatDescr_t  matCDescr;
1937   PetscInt              Blda,Clda; /* Record leading dimensions of B and C here to detect changes */
1938   size_t                mmBufferSize;
1939   void                  *mmBuffer;
1940   void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
1941   cusparseSpGEMMDescr_t spgemmDesc;
1942 #endif
1943 };
1944 
1945 static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
1946 {
1947   PetscErrorCode   ierr;
1948   MatMatCusparse   *mmdata = (MatMatCusparse *)data;
1949   cudaError_t      cerr;
1950  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1951   cusparseStatus_t stat;
1952  #endif
1953 
1954   PetscFunctionBegin;
1955   cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr);
1956   delete mmdata->Bcsr;
1957  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1958   if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); }
1959   if (mmdata->mmBuffer)    { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); }
1960   if (mmdata->mmBuffer2)   { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); }
1961   if (mmdata->matBDescr)   { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); }
1962   if (mmdata->matCDescr)   { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); }
1963   if (mmdata->spgemmDesc)  { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); }
1964  #endif
1965   ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr);
1966   ierr = PetscFree(data);CHKERRQ(ierr);
1967   PetscFunctionReturn(0);
1968 }
1969 
1970 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1971 
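/* Numeric phase of C = op(A) * op(B) for a SeqAIJCUSPARSE A and a dense CUDA B. For PtAP and RARt
   the sparse-dense product is first written into the intermediate mmdata->X and the result is then
   finished with a dense-dense multiply (see the calls at the end of this routine) */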
1972 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
1973 {
1974   Mat_Product                  *product = C->product;
1975   Mat                          A,B;
1976   PetscInt                     m,n,blda,clda;
1977   PetscBool                    flg,biscuda;
1978   Mat_SeqAIJCUSPARSE           *cusp;
1979   cusparseStatus_t             stat;
1980   cusparseOperation_t          opA;
1981   const PetscScalar            *barray;
1982   PetscScalar                  *carray;
1983   PetscErrorCode               ierr;
1984   MatMatCusparse               *mmdata;
1985   Mat_SeqAIJCUSPARSEMultStruct *mat;
1986   CsrMatrix                    *csrmat;
1987   cudaError_t                  cerr;
1988 
1989   PetscFunctionBegin;
1990   MatCheckProduct(C,1);
1991   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
1992   mmdata = (MatMatCusparse*)product->data;
1993   A    = product->A;
1994   B    = product->B;
1995   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
1996   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
1997   /* currently MatSeqAIJCUSPARSECopyToGPU does not copy if the matrix is bound to the CPU;
1998      instead of silently accepting the wrong answer, I prefer to raise the error */
1999   if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2000   ierr   = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2001   cusp   = (Mat_SeqAIJCUSPARSE*)A->spptr;
2002   switch (product->type) {
2003   case MATPRODUCT_AB:
2004   case MATPRODUCT_PtAP:
2005     mat = cusp->mat;
2006     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2007     m   = A->rmap->n;
2008     n   = B->cmap->n;
2009     break;
2010   case MATPRODUCT_AtB:
2011     if (!A->form_explicit_transpose) {
2012       mat = cusp->mat;
2013       opA = CUSPARSE_OPERATION_TRANSPOSE;
2014     } else {
2015       ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
2016       mat  = cusp->matTranspose;
2017       opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
2018     }
2019     m = A->cmap->n;
2020     n = B->cmap->n;
2021     break;
2022   case MATPRODUCT_ABt:
2023   case MATPRODUCT_RARt:
2024     mat = cusp->mat;
2025     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2026     m   = A->rmap->n;
2027     n   = B->rmap->n;
2028     break;
2029   default:
2030     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2031   }
2032   if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
2033   csrmat = (CsrMatrix*)mat->mat;
2034   /* if the user passed a CPU matrix, copy the data to the GPU */
2035   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
2036   if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
2037   ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);
2038 
2039   ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
2040   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2041     ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
2042     ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
2043   } else {
2044     ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
2045     ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
2046   }
2047 
2048   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2049  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2050   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2051   /* (re)allocate mmBuffer if not initialized or LDAs are different */
2052   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2053     size_t mmBufferSize;
2054     if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
2055     if (!mmdata->matBDescr) {
2056       stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
2057       mmdata->Blda = blda;
2058     }
2059 
2060     if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
2061     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2062       stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
2063       mmdata->Clda = clda;
2064     }
2065 
2066     if (!mat->matDescr) {
2067       stat = cusparseCreateCsr(&mat->matDescr,
2068                                csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
2069                                csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
2070                                csrmat->values->data().get(),
2071                                CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2072                                CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
2073     }
2074     stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
2075                                    mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2076                                    mmdata->matCDescr,cusparse_scalartype,
2077                                    cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
2078     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2079       cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
2080       cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
2081       mmdata->mmBufferSize = mmBufferSize;
2082     }
2083     mmdata->initialized = PETSC_TRUE;
2084   } else {
2085     /* to be safe, always update pointers of the mats */
2086     stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
2087     stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
2088     stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
2089   }
2090 
2091   /* do cusparseSpMM, which supports transpose on B */
2092   stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
2093                       mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2094                       mmdata->matCDescr,cusparse_scalartype,
2095                       cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2096  #else
2097   PetscInt k;
2098   /* cusparseXcsrmm does not support transpose on B */
2099   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2100     cublasHandle_t cublasv2handle;
2101     cublasStatus_t cerr;
2102 
2103     ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
2104     cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
2105                        B->cmap->n,B->rmap->n,
2106                        &PETSC_CUSPARSE_ONE ,barray,blda,
2107                        &PETSC_CUSPARSE_ZERO,barray,blda,
2108                        mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
2109     blda = B->cmap->n;
2110     k    = B->cmap->n;
2111   } else {
2112     k    = B->rmap->n;
2113   }
2114 
2115   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2116   stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
2117                            csrmat->num_entries,mat->alpha_one,mat->descr,
2118                            csrmat->values->data().get(),
2119                            csrmat->row_offsets->data().get(),
2120                            csrmat->column_indices->data().get(),
2121                            mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
2122                            carray,clda);CHKERRCUSPARSE(stat);
2123  #endif
2124   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2125   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2126   ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
2127   ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
2128   if (product->type == MATPRODUCT_RARt) {
2129     ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
2130     ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
2131   } else if (product->type == MATPRODUCT_PtAP) {
2132     ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
2133     ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
2134   } else {
2135     ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
2136   }
2137   if (mmdata->cisdense) {
2138     ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
2139   }
2140   if (!biscuda) {
2141     ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
2142   }
2143   PetscFunctionReturn(0);
2144 }
2145 
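/* Symbolic phase for products with a MATSEQAIJCUSPARSE matrix A and a dense B: sets the sizes of C from the
   product type, switches C to MATSEQDENSECUDA, and allocates the MatMatCusparse product data (a buffer for
   B^T with CUDA < 11, and the intermediate dense matrix X for RARt/PtAP) before installing the numeric routine. */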
2146 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2147 {
2148   Mat_Product        *product = C->product;
2149   Mat                A,B;
2150   PetscInt           m,n;
2151   PetscBool          cisdense,flg;
2152   PetscErrorCode     ierr;
2153   MatMatCusparse     *mmdata;
2154   Mat_SeqAIJCUSPARSE *cusp;
2155 
2156   PetscFunctionBegin;
2157   MatCheckProduct(C,1);
2158   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
2159   A    = product->A;
2160   B    = product->B;
2161   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2162   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
2163   cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2164   if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2165   switch (product->type) {
2166   case MATPRODUCT_AB:
2167     m = A->rmap->n;
2168     n = B->cmap->n;
2169     break;
2170   case MATPRODUCT_AtB:
2171     m = A->cmap->n;
2172     n = B->cmap->n;
2173     break;
2174   case MATPRODUCT_ABt:
2175     m = A->rmap->n;
2176     n = B->rmap->n;
2177     break;
2178   case MATPRODUCT_PtAP:
2179     m = B->cmap->n;
2180     n = B->cmap->n;
2181     break;
2182   case MATPRODUCT_RARt:
2183     m = B->rmap->n;
2184     n = B->rmap->n;
2185     break;
2186   default:
2187     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2188   }
2189   ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
2190   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy the result back to the CPU */
2191   ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
2192   ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);
2193 
2194   /* product data */
2195   ierr = PetscNew(&mmdata);CHKERRQ(ierr);
2196   mmdata->cisdense = cisdense;
2197  #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
2198   /* cusparseXcsrmm does not support transpose on B, so we allocate a buffer to store B^T */
2199   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2200     cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
2201   }
2202  #endif
2203   /* for these products we need intermediate storage */
2204   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2205     ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
2206     ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
2207     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
2208       ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
2209     } else {
2210       ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
2211     }
2212   }
2213   C->product->data    = mmdata;
2214   C->product->destroy = MatDestroy_MatMatCusparse;
2215 
2216   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
2217   PetscFunctionReturn(0);
2218 }
2219 
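/* Numeric phase of the sparse-sparse product: reuses the descriptors and buffers created in the symbolic phase
   and recomputes the values of C on the GPU, with cusparseSpGEMM_compute/cusparseSpGEMM_copy for CUDA >= 11 or
   cusparse_csr_spgemm otherwise, finishing with a shortened version of MatAssemblyEnd_SeqAIJ. */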
2220 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2221 {
2222   Mat_Product                  *product = C->product;
2223   Mat                          A,B;
2224   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2225   Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
2226   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2227   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2228   PetscBool                    flg;
2229   PetscErrorCode               ierr;
2230   cusparseStatus_t             stat;
2231   cudaError_t                  cerr;
2232   MatProductType               ptype;
2233   MatMatCusparse               *mmdata;
2234 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2235   cusparseSpMatDescr_t         BmatSpDescr;
2236 #endif
2237 
2238   PetscFunctionBegin;
2239   MatCheckProduct(C,1);
2240   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
2241   ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2242   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for C of type %s",((PetscObject)C)->type_name);
2243   mmdata = (MatMatCusparse*)C->product->data;
2244   A = product->A;
2245   B = product->B;
2246   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have already been computed in the MatProductSymbolic phase */
2247     mmdata->reusesym = PETSC_FALSE;
2248     Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2249     if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2250     Cmat = Ccusp->mat;
2251     if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
2252     Ccsr = (CsrMatrix*)Cmat->mat;
2253     if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
2254     goto finalize;
2255   }
2256   if (!c->nz) goto finalize;
2257   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2258   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
2259   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2260   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
2261   if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2262   if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2263   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2264   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2265   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2266   if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2267   if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2268   if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2269   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2270   ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
2271 
2272   ptype = product->type;
2273   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
2274   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
2275   switch (ptype) {
2276   case MATPRODUCT_AB:
2277     Amat = Acusp->mat;
2278     Bmat = Bcusp->mat;
2279     break;
2280   case MATPRODUCT_AtB:
2281     Amat = Acusp->matTranspose;
2282     Bmat = Bcusp->mat;
2283     break;
2284   case MATPRODUCT_ABt:
2285     Amat = Acusp->mat;
2286     Bmat = Bcusp->matTranspose;
2287     break;
2288   default:
2289     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2290   }
2291   Cmat = Ccusp->mat;
2292   if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2293   if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2294   if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
2295   Acsr = (CsrMatrix*)Amat->mat;
2296   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
2297   Ccsr = (CsrMatrix*)Cmat->mat;
2298   if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
2299   if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
2300   if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
2301   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2302 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2303   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
2304   stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2305                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2306                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2307                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2308   stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2309                              Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2310                              cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2311 #else
2312   stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2313                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2314                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2315                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2316                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2317 #endif
2318   ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
2319   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2320   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2321   C->offloadmask = PETSC_OFFLOAD_GPU;
2322 finalize:
2323   /* shorter version of MatAssemblyEnd_SeqAIJ */
2324   ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
2325   ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
2326   ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
2327   c->reallocs         = 0;
2328   C->info.mallocs    += 0;
2329   C->info.nz_unneeded = 0;
2330   C->assembled = C->was_assembled = PETSC_TRUE;
2331   C->num_ass++;
2332   PetscFunctionReturn(0);
2333 }
2334 
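/* Symbolic phase of the sparse-sparse product: builds the CSR structure of C on the GPU (with the cusparseSpGEMM
   API for CUDA >= 11, or cusparseXcsrgemmNnz plus cusparse_csr_spgemm otherwise), handles compressed-row A and B,
   and copies the resulting row offsets and column indices back to the host. */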
2335 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2336 {
2337   Mat_Product                  *product = C->product;
2338   Mat                          A,B;
2339   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2340   Mat_SeqAIJ                   *a,*b,*c;
2341   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2342   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2343   PetscInt                     i,j,m,n,k;
2344   PetscBool                    flg;
2345   PetscErrorCode               ierr;
2346   cusparseStatus_t             stat;
2347   cudaError_t                  cerr;
2348   MatProductType               ptype;
2349   MatMatCusparse               *mmdata;
2350   PetscLogDouble               flops;
2351   PetscBool                    biscompressed,ciscompressed;
2352 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2353   int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
2354   size_t                       bufSize2;
2355   cusparseSpMatDescr_t         BmatSpDescr;
2356 #else
2357   int                          cnz;
2358 #endif
2359 
2360   PetscFunctionBegin;
2361   MatCheckProduct(C,1);
2362   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
2363   A    = product->A;
2364   B    = product->B;
2365   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2366   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
2367   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2368   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
2369   a = (Mat_SeqAIJ*)A->data;
2370   b = (Mat_SeqAIJ*)B->data;
2371   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2372   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2373   if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2374   if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2375 
2376   /* product data */
2377   ierr = PetscNew(&mmdata);CHKERRQ(ierr);
2378   C->product->data    = mmdata;
2379   C->product->destroy = MatDestroy_MatMatCusparse;
2380 
2381   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2382   ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
2383   ptype = product->type;
2384   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
2385   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
2386   biscompressed = PETSC_FALSE;
2387   ciscompressed = PETSC_FALSE;
2388   switch (ptype) {
2389   case MATPRODUCT_AB:
2390     m = A->rmap->n;
2391     n = B->cmap->n;
2392     k = A->cmap->n;
2393     Amat = Acusp->mat;
2394     Bmat = Bcusp->mat;
2395     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2396     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2397     break;
2398   case MATPRODUCT_AtB:
2399     m = A->cmap->n;
2400     n = B->cmap->n;
2401     k = A->rmap->n;
2402     ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
2403     Amat = Acusp->matTranspose;
2404     Bmat = Bcusp->mat;
2405     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2406     break;
2407   case MATPRODUCT_ABt:
2408     m = A->rmap->n;
2409     n = B->rmap->n;
2410     k = A->cmap->n;
2411     ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr);
2412     Amat = Acusp->mat;
2413     Bmat = Bcusp->matTranspose;
2414     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2415     break;
2416   default:
2417     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2418   }
2419 
2420   /* create cusparse matrix */
2421   ierr  = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
2422   ierr  = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
2423   c     = (Mat_SeqAIJ*)C->data;
2424   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2425   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
2426   Ccsr  = new CsrMatrix;
2427 
2428   c->compressedrow.use = ciscompressed;
2429   if (c->compressedrow.use) { /* if a is in compressed row format, then c will be in compressed row format */
2430     c->compressedrow.nrows = a->compressedrow.nrows;
2431     ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
2432     ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
2433     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
2434     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2435     Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
2436   } else {
2437     c->compressedrow.nrows  = 0;
2438     c->compressedrow.i      = NULL;
2439     c->compressedrow.rindex = NULL;
2440     Ccusp->workVector       = NULL;
2441     Cmat->cprowIndices      = NULL;
2442   }
2443   Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
2444   Ccusp->mat      = Cmat;
2445   Ccusp->mat->mat = Ccsr;
2446   Ccsr->num_rows    = Ccusp->nrows;
2447   Ccsr->num_cols    = n;
2448   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
2449   stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
2450   stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
2451   stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
2452   cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
2453   cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
2454   cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
2455   cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2456   cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2457   cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2458   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raises errors in different calls when matrices have zero rows/columns! */
2459     thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
2460     c->nz = 0;
2461     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2462     Ccsr->values = new THRUSTARRAY(c->nz);
2463     goto finalizesym;
2464   }
2465 
2466   if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2467   if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2468   Acsr = (CsrMatrix*)Amat->mat;
2469   if (!biscompressed) {
2470     Bcsr = (CsrMatrix*)Bmat->mat;
2471 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2472     BmatSpDescr = Bmat->matDescr;
2473 #endif
2474   } else { /* we need to use row offsets for the full matrix */
2475     CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
2476     Bcsr = new CsrMatrix;
2477     Bcsr->num_rows       = B->rmap->n;
2478     Bcsr->num_cols       = cBcsr->num_cols;
2479     Bcsr->num_entries    = cBcsr->num_entries;
2480     Bcsr->column_indices = cBcsr->column_indices;
2481     Bcsr->values         = cBcsr->values;
2482     if (!Bcusp->rowoffsets_gpu) {
2483       Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
2484       Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
2485       ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
2486     }
2487     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2488     mmdata->Bcsr = Bcsr;
2489 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2490     if (Bcsr->num_rows && Bcsr->num_cols) {
2491       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
2492                                Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2493                                Bcsr->values->data().get(),
2494                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2495                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2496     }
2497     BmatSpDescr = mmdata->matSpBDescr;
2498 #endif
2499   }
2500   if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
2501   if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
2502   /* precompute flops count */
2503   if (ptype == MATPRODUCT_AB) {
2504     for (i=0, flops = 0; i<A->rmap->n; i++) {
2505       const PetscInt st = a->i[i];
2506       const PetscInt en = a->i[i+1];
2507       for (j=st; j<en; j++) {
2508         const PetscInt brow = a->j[j];
2509         flops += 2.*(b->i[brow+1] - b->i[brow]);
2510       }
2511     }
2512   } else if (ptype == MATPRODUCT_AtB) {
2513     for (i=0, flops = 0; i<A->rmap->n; i++) {
2514       const PetscInt anzi = a->i[i+1] - a->i[i];
2515       const PetscInt bnzi = b->i[i+1] - b->i[i];
2516       flops += (2.*anzi)*bnzi;
2517     }
2518   } else { /* TODO */
2519     flops = 0.;
2520   }
2521 
2522   mmdata->flops = flops;
2523   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2524 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2525   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2526   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
2527                            NULL, NULL, NULL,
2528                            CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2529                            CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2530   stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2531   /* ask bufferSize bytes for external memory */
2532   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2533                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2534                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2535                                        mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
2536   cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
2537   /* inspect the matrices A and B to understand the memory requirement for the next step */
2538   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2539                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2540                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2541                                        mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
2542   /* ask again how many bytes of external memory are needed, this time for the compute step */
2543   stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2544                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2545                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2546                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
2547   /* Neither the CUSPARSE documentation nor the API is clear here:
2548      we need both buffers to perform the operations properly.
2549      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API;
2550      it only appears in the workEstimation calls, but it seems to be needed in compute as well, so presumably
2551      its address is stored in the descriptor. What a messy API... */
2552   cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
2553   /* compute the intermediate product of A * B */
2554   stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2555                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2556                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2557                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2558   /* get matrix C non-zero entries C_nnz1 */
2559   stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
2560   c->nz = (PetscInt) C_nnz1;
2561   ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr);
2562   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2563   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2564   Ccsr->values = new THRUSTARRAY(c->nz);
2565   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2566   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
2567                                 Ccsr->values->data().get());CHKERRCUSPARSE(stat);
2568   stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2569                              Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2570                              cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2571 #else
2572   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
2573   stat = cusparseXcsrgemmNnz(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2574                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2575                              Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2576                              Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2577                              Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
2578   c->nz = cnz;
2579   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2580   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2581   Ccsr->values = new THRUSTARRAY(c->nz);
2582   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2583 
2584   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2585   /* With the old gemm interface (removed as of CUDA 11.0) we cannot compute the symbolic factorization only.
2586      I have tried the gemm2 interface (alpha * A * B + beta * D), which allows a symbolic-only pass by passing NULL for the values, but it seems quite buggy when
2587      D is NULL, despite the fact that the CUSPARSE documentation claims this is supported! */
2588   stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2589                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2590                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2591                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2592                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2593 #endif
2594   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2595   ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
2596   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2597 finalizesym:
2598   c->singlemalloc = PETSC_FALSE;
2599   c->free_a       = PETSC_TRUE;
2600   c->free_ij      = PETSC_TRUE;
2601   ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
2602   ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
2603   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
2604     PetscInt *d_i = c->i;
2605     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
2606     THRUSTINTARRAY jj(Ccsr->column_indices->size());
2607     ii   = *Ccsr->row_offsets;
2608     jj   = *Ccsr->column_indices;
2609     if (ciscompressed) d_i = c->compressedrow.i;
2610     cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2611     cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2612   } else {
2613     PetscInt *d_i = c->i;
2614     if (ciscompressed) d_i = c->compressedrow.i;
2615     cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2616     cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2617   }
2618   if (ciscompressed) { /* need to expand host row offsets */
2619     PetscInt r = 0;
2620     c->i[0] = 0;
2621     for (k = 0; k < c->compressedrow.nrows; k++) {
2622       const PetscInt next = c->compressedrow.rindex[k];
2623       const PetscInt old = c->compressedrow.i[k];
2624       for (; r < next; r++) c->i[r+1] = old;
2625     }
2626     for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
2627   }
2628   ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
2629   ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
2630   ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
2631   c->maxnz = c->nz;
2632   c->nonzerorowcnt = 0;
2633   c->rmax = 0;
2634   for (k = 0; k < m; k++) {
2635     const PetscInt nn = c->i[k+1] - c->i[k];
2636     c->ilen[k] = c->imax[k] = nn;
2637     c->nonzerorowcnt += (PetscInt)!!nn;
2638     c->rmax = PetscMax(c->rmax,nn);
2639   }
2640   ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
2641   ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
2642   Ccsr->num_entries = c->nz;
2643 
2644   C->nonzerostate++;
2645   ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
2646   ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
2647   Ccusp->nonzerostate = C->nonzerostate;
2648   C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
2649   C->preallocated  = PETSC_TRUE;
2650   C->assembled     = PETSC_FALSE;
2651   C->was_assembled = PETSC_FALSE;
2652   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
2653     mmdata->reusesym = PETSC_TRUE;
2654     C->offloadmask   = PETSC_OFFLOAD_GPU;
2655   }
2656   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2657   PetscFunctionReturn(0);
2658 }
2659 
2660 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2661 
2662 /* handles sparse or dense B */
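/* When both operands are MATSEQAIJCUSPARSE, the CPU backend can still be requested through the options handled
   below, e.g. -matmatmult_backend_cpu for a user-level MatMatMult() or -matproduct_ab_backend_cpu when the
   product is driven through the MatProduct API. */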
2663 static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
2664 {
2665   Mat_Product    *product = mat->product;
2666   PetscErrorCode ierr;
2667   PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;
2668 
2669   PetscFunctionBegin;
2670   MatCheckProduct(mat,1);
2671   ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr);
2672   if (!product->A->boundtocpu && !product->B->boundtocpu) {
2673     ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr);
2674   }
2675   if (product->type == MATPRODUCT_ABC) {
2676     Ciscusp = PETSC_FALSE;
2677     if (!product->C->boundtocpu) {
2678       ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr);
2679     }
2680   }
2681   if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
2682     PetscBool usecpu = PETSC_FALSE;
2683     switch (product->type) {
2684     case MATPRODUCT_AB:
2685       if (product->api_user) {
2686         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");CHKERRQ(ierr);
2687         ierr = PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
2688         ierr = PetscOptionsEnd();CHKERRQ(ierr);
2689       } else {
2690         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");CHKERRQ(ierr);
2691         ierr = PetscOptionsBool("-matproduct_ab_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
2692         ierr = PetscOptionsEnd();CHKERRQ(ierr);
2693       }
2694       break;
2695     case MATPRODUCT_AtB:
2696       if (product->api_user) {
2697         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr);
2698         ierr = PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
2699         ierr = PetscOptionsEnd();CHKERRQ(ierr);
2700       } else {
2701         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");CHKERRQ(ierr);
2702         ierr = PetscOptionsBool("-matproduct_atb_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
2703         ierr = PetscOptionsEnd();CHKERRQ(ierr);
2704       }
2705       break;
2706     case MATPRODUCT_PtAP:
2707       if (product->api_user) {
2708         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
2709         ierr = PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr);
2710         ierr = PetscOptionsEnd();CHKERRQ(ierr);
2711       } else {
2712         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");CHKERRQ(ierr);
2713         ierr = PetscOptionsBool("-matproduct_ptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr);
2714         ierr = PetscOptionsEnd();CHKERRQ(ierr);
2715       }
2716       break;
2717     case MATPRODUCT_RARt:
2718       if (product->api_user) {
2719         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");CHKERRQ(ierr);
2720         ierr = PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr);
2721         ierr = PetscOptionsEnd();CHKERRQ(ierr);
2722       } else {
2723         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");CHKERRQ(ierr);
2724         ierr = PetscOptionsBool("-matproduct_rart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr);
2725         ierr = PetscOptionsEnd();CHKERRQ(ierr);
2726       }
2727       break;
2728     case MATPRODUCT_ABC:
2729       if (product->api_user) {
2730         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");CHKERRQ(ierr);
2731         ierr = PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
2732         ierr = PetscOptionsEnd();CHKERRQ(ierr);
2733       } else {
2734         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");CHKERRQ(ierr);
2735         ierr = PetscOptionsBool("-matproduct_abc_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
2736         ierr = PetscOptionsEnd();CHKERRQ(ierr);
2737       }
2738       break;
2739     default:
2740       break;
2741     }
2742     if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
2743   }
2744   /* dispatch */
2745   if (isdense) {
2746     switch (product->type) {
2747     case MATPRODUCT_AB:
2748     case MATPRODUCT_AtB:
2749     case MATPRODUCT_ABt:
2750     case MATPRODUCT_PtAP:
2751     case MATPRODUCT_RARt:
2752      if (product->A->boundtocpu) {
2753         ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
2754       } else {
2755         mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
2756       }
2757       break;
2758     case MATPRODUCT_ABC:
2759       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2760       break;
2761     default:
2762       break;
2763     }
2764   } else if (Biscusp && Ciscusp) {
2765     switch (product->type) {
2766     case MATPRODUCT_AB:
2767     case MATPRODUCT_AtB:
2768     case MATPRODUCT_ABt:
2769       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2770       break;
2771     case MATPRODUCT_PtAP:
2772     case MATPRODUCT_RARt:
2773     case MATPRODUCT_ABC:
2774       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2775       break;
2776     default:
2777       break;
2778     }
2779   } else { /* fallback for AIJ */
2780     ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
2781   }
2782   PetscFunctionReturn(0);
2783 }
2784 
2785 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2786 {
2787   PetscErrorCode ierr;
2788 
2789   PetscFunctionBegin;
2790   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
2791   PetscFunctionReturn(0);
2792 }
2793 
2794 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
2795 {
2796   PetscErrorCode ierr;
2797 
2798   PetscFunctionBegin;
2799   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
2800   PetscFunctionReturn(0);
2801 }
2802 
2803 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2804 {
2805   PetscErrorCode ierr;
2806 
2807   PetscFunctionBegin;
2808   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
2809   PetscFunctionReturn(0);
2810 }
2811 
2812 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
2813 {
2814   PetscErrorCode ierr;
2815 
2816   PetscFunctionBegin;
2817   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
2818   PetscFunctionReturn(0);
2819 }
2820 
2821 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2822 {
2823   PetscErrorCode ierr;
2824 
2825   PetscFunctionBegin;
2826   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
2827   PetscFunctionReturn(0);
2828 }
2829 
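/* Helper kernel used below when the matrix is stored in compressed-row form: it adds the entries of the short
   work vector x into the full-length vector y at the row positions listed in idx, i.e. y[idx[i]] += x[i]. */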
2830 __global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
2831 {
2832   int i = blockIdx.x*blockDim.x + threadIdx.x;
2833   if (i < n) y[idx[i]] += x[i];
2834 }
2835 
2836 /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
2837 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
2838 {
2839   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
2840   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
2841   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
2842   PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
2843   PetscErrorCode               ierr;
2844   cudaError_t                  cerr;
2845   cusparseStatus_t             stat;
2846   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2847   PetscBool                    compressed;
2848 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2849   PetscInt                     nx,ny;
2850 #endif
2851 
2852   PetscFunctionBegin;
2853   if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
2854   if (!a->nonzerorowcnt) {
2855     if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
2856     else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
2857     PetscFunctionReturn(0);
2858   }
2859   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
2860   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2861   if (!trans) {
2862     matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2863     if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
2864   } else {
2865     if (herm || !A->form_explicit_transpose) {
2866       opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
2867       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2868     } else {
2869       if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);}
2870       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
2871     }
2872   }
2873   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
2874   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
2875 
2876   try {
2877     ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
2878     if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
2879     else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */
2880 
2881     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2882     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2883       /* z = A x + beta y.
2884          If A is compressed (with fewer rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
2885          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
2886       */
2887       xptr = xarray;
2888       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
2889       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
2890      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2891       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
2892           allocated to accommodate different uses. So we get the length info directly from mat.
2893        */
2894       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2895         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2896         nx = mat->num_cols;
2897         ny = mat->num_rows;
2898       }
2899      #endif
2900     } else {
2901       /* z = A^T x + beta y
2902          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
2903          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
2904        */
2905       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
2906       dptr = zarray;
2907       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
2908       if (compressed) { /* Scatter x to work vector */
2909         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
2910         thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
2911                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2912                          VecCUDAEqualsReverse());
2913       }
2914      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2915       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2916         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2917         nx = mat->num_rows;
2918         ny = mat->num_cols;
2919       }
2920      #endif
2921     }
2922 
2923     /* csr_spmv does y = alpha op(A) x + beta y */
2924     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2925      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2926       if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
2927       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
2928         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2929         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2930         stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
2931                                 matstruct->matDescr,
2932                                 matstruct->cuSpMV[opA].vecXDescr, beta,
2933                                 matstruct->cuSpMV[opA].vecYDescr,
2934                                 cusparse_scalartype,
2935                                 cusparsestruct->spmvAlg,
2936                                 &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
2937         cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);
2938 
2939         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
2940       } else {
2941         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
2942         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
2943         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
2944       }
2945 
2946       stat = cusparseSpMV(cusparsestruct->handle, opA,
2947                                matstruct->alpha_one,
2948                                matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTransposeForMult() */
2949                                matstruct->cuSpMV[opA].vecXDescr,
2950                                beta,
2951                                matstruct->cuSpMV[opA].vecYDescr,
2952                                cusparse_scalartype,
2953                                cusparsestruct->spmvAlg,
2954                                matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
2955      #else
2956       CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2957       stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
2958                                mat->num_rows, mat->num_cols,
2959                                mat->num_entries, matstruct->alpha_one, matstruct->descr,
2960                                mat->values->data().get(), mat->row_offsets->data().get(),
2961                                mat->column_indices->data().get(), xptr, beta,
2962                                dptr);CHKERRCUSPARSE(stat);
2963      #endif
2964     } else {
2965       if (cusparsestruct->nrows) {
2966        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2967         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2968        #else
2969         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
2970         stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
2971                                  matstruct->alpha_one, matstruct->descr, hybMat,
2972                                  xptr, beta,
2973                                  dptr);CHKERRCUSPARSE(stat);
2974        #endif
2975       }
2976     }
2977     cerr = WaitForCUDA();CHKERRCUDA(cerr);
2978     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2979 
2980     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2981       if (yy) { /* MatMultAdd: zz = A*xx + yy */
2982         if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
2983           ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
2984         } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
2985           ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
2986         }
2987       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
2988         ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
2989       }
2990 
2991       /* ScatterAdd the result from work vector into the full vector when A is compressed */
2992       if (compressed) {
2993         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2994         /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
2995            and in the destructor of its scope it will call cudaStreamSynchronize() on this stream. One would have to store all events to
2996            prevent that, so we just use the ScatterAdd kernel defined above.
2997          */
2998        #if 0
2999         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
3000         thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
3001                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
3002                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
3003                          VecCUDAPlusEquals());
3004        #else
3005         PetscInt n = matstruct->cprowIndices->size();
3006         ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
3007        #endif
3008         cerr = WaitForCUDA();CHKERRCUDA(cerr);
3009         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
3010       }
3011     } else {
3012       if (yy && yy != zz) {
3013         ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
3014       }
3015     }
3016     ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
3017     if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
3018     else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
3019   } catch(char *ex) {
3020     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
3021   }
3022   if (yy) {
3023     ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
3024   } else {
3025     ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
3026   }
3027   PetscFunctionReturn(0);
3028 }
3029 
3030 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
3031 {
3032   PetscErrorCode ierr;
3033 
3034   PetscFunctionBegin;
3035   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
3036   PetscFunctionReturn(0);
3037 }
3038 
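/* Assembly end: delegates to MatAssemblyEnd_SeqAIJ() and, when a device split-CSR structure has been built for
   this matrix, flags the assembled data as resident on the GPU. */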
3039 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
3040 {
3041   PetscErrorCode              ierr;
3042   PetscSplitCSRDataStructure  *d_mat = NULL;
3043   PetscFunctionBegin;
3044   if (A->factortype == MAT_FACTOR_NONE) {
3045     d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
3046   }
3047   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); // this does very little if assembled on GPU - call it?
3048   if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0);
3049   if (d_mat) {
3050     A->offloadmask = PETSC_OFFLOAD_GPU;
3051   }
3052 
3053   PetscFunctionReturn(0);
3054 }
3055 
3056 /* --------------------------------------------------------------------------------*/
3057 /*@
3058    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
3059    (the default parallel PETSc format). This matrix will ultimately be pushed down
3060    to NVIDIA GPUs and will use the CUSPARSE library for calculations. For good matrix
3061    assembly performance the user should preallocate the matrix storage by setting
3062    the parameter nz (or the array nnz).  By setting these parameters accurately,
3063    performance during matrix assembly can be increased by more than a factor of 50.
3064 
3065    Collective
3066 
3067    Input Parameters:
3068 +  comm - MPI communicator, set to PETSC_COMM_SELF
3069 .  m - number of rows
3070 .  n - number of columns
3071 .  nz - number of nonzeros per row (same for all rows)
3072 -  nnz - array containing the number of nonzeros in the various rows
3073          (possibly different for each row) or NULL
3074 
3075    Output Parameter:
3076 .  A - the matrix
3077 
3078    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
3079    MatXXXXSetPreallocation() paradigm instead of this routine directly.
3080    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
3081 
3082    Notes:
3083    If nnz is given then nz is ignored
3084 
3085    The AIJ format (also called the Yale sparse matrix format or
3086    compressed row storage), is fully compatible with standard Fortran 77
3087    storage.  That is, the stored row and column indices can begin at
3088    either one (as in Fortran) or zero.  See the users' manual for details.
3089 
3090    Specify the preallocated storage with either nz or nnz (not both).
3091    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
3092    allocation.  For large problems you MUST preallocate memory or you
3093    will get TERRIBLE performance, see the users' manual chapter on matrices.
3094 
3095    By default, this format uses inodes (identical nodes) when possible, to
3096    improve numerical efficiency of matrix-vector products and solves. We
3097    search for consecutive rows with the same nonzero structure, thereby
3098    reusing matrix information to achieve increased efficiency.
3099 
3100    Level: intermediate
3101 
3102 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
3103 @*/
3104 PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
3105 {
3106   PetscErrorCode ierr;
3107 
3108   PetscFunctionBegin;
3109   ierr = MatCreate(comm,A);CHKERRQ(ierr);
3110   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
3111   ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
3112   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
3113   PetscFunctionReturn(0);
3114 }
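
/*
   A minimal usage sketch (illustrative only, kept in a comment and not compiled as part of this file),
   following the MatCreate()/MatSetType()/MatSeqAIJSetPreallocation() paradigm recommended in the
   manual page above; the size n and the 5 nonzeros per row are hypothetical:

     Mat            A;
     PetscErrorCode ierr;
     PetscInt       n = 100;                                     // sequential matrix: local size equals global size
     ierr = MatCreate(PETSC_COMM_SELF,&A);CHKERRQ(ierr);
     ierr = MatSetSizes(A,n,n,n,n);CHKERRQ(ierr);
     ierr = MatSetType(A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
     ierr = MatSeqAIJSetPreallocation(A,5,NULL);CHKERRQ(ierr);   // at most 5 nonzeros per row
     // ... MatSetValues(), MatAssemblyBegin(), MatAssemblyEnd() ...
     ierr = MatDestroy(&A);CHKERRQ(ierr);
*/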
3115 
3116 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
3117 {
3118   PetscErrorCode              ierr;
3119   PetscSplitCSRDataStructure  *d_mat = NULL;
3120 
3121   PetscFunctionBegin;
3122   if (A->factortype == MAT_FACTOR_NONE) {
3123     d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
3124     ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL;
3125     ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
3126   } else {
3127     ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
3128   }
3129   if (d_mat) {
3130     Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;
3131     cudaError_t                err;
3132     PetscSplitCSRDataStructure h_mat;
3133     ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
3134     err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
3135     if (a->compressedrow.use) {
3136       err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
3137     }
3138     err = cudaFree(d_mat);CHKERRCUDA(err);
3139   }
3140   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
3141   ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
3142   ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
3143   ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
3144   ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
3145   ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
3146   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
3147   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
3148   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
3149   PetscFunctionReturn(0);
3150 }
3151 
3152 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
3153 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
3154 static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
3155 {
3156   PetscErrorCode ierr;
3157 
3158   PetscFunctionBegin;
3159   ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
3160   ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
3161   PetscFunctionReturn(0);
3162 }
3163 
3164 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
3165 {
3166   PetscErrorCode     ierr;
3167   Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
3168   Mat_SeqAIJCUSPARSE *cy;
3169   Mat_SeqAIJCUSPARSE *cx;
3170   PetscScalar        *ay;
3171   const PetscScalar  *ax;
3172   CsrMatrix          *csry,*csrx;
3173   cudaError_t        cerr;
3174 
3175   PetscFunctionBegin;
3176   cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
3177   cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
3178   if (X->ops->axpy != Y->ops->axpy) {
3179     ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
3180     ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
3181     PetscFunctionReturn(0);
3182   }
3183   /* if we are here, it means both matrices are bound to GPU */
3184   ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
3185   ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
3186   if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
3187   if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
3188   csry = (CsrMatrix*)cy->mat->mat;
3189   csrx = (CsrMatrix*)cx->mat->mat;
3190   /* see if we can turn this into a cublas axpy */
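  /* If X and Y have the same number of nonzeros and their row offsets and column indices on the GPU
     compare equal, the two matrices share the nonzero pattern, and the sum reduces to an element-wise
     axpy over the two value arrays (handled in the SAME_NONZERO_PATTERN branch below). */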
3191   if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
3192     bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
3193     if (eq) {
3194       eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
3195     }
3196     if (eq) str = SAME_NONZERO_PATTERN;
3197   }
3198   /* spgeam is buggy with one column */
3199   if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;
3200 
3201   if (str == SUBSET_NONZERO_PATTERN) {
3202     cusparseStatus_t stat;
3203     PetscScalar      b = 1.0;
3204 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3205     size_t           bufferSize;
3206     void             *buffer;
3207 #endif
3208 
3209     ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
3210     ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
3211     stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
3212 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3213     stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
3214                                           &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
3215                                           &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
3216                                              cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
3217     cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
3218     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3219     stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
3220                                &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
3221                                &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
3222                                   cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
3223     cerr = WaitForCUDA();CHKERRCUDA(cerr);
3224     ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
3225     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
3226     cerr = cudaFree(buffer);CHKERRCUDA(cerr);
3227 #else
3228     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3229     stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
3230                                &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
3231                                &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
3232                                   cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat);
3233     cerr = WaitForCUDA();CHKERRCUDA(cerr);
3234     ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
3235     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
3236 #endif
3237     stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
3238     ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
3239     ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
3240     ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
3241   } else if (str == SAME_NONZERO_PATTERN) {
3242     cublasHandle_t cublasv2handle;
3243     cublasStatus_t berr;
3244     PetscBLASInt   one = 1, bnz = 1;
3245 
3246     ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
3247     ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
3248     ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
3249     ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr);
3250     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3251     berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr);
3252     cerr = WaitForCUDA();CHKERRCUDA(cerr);
3253     ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
3254     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
3255     ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
3256     ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
3257     ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
3258   } else {
3259     ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
3260     ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
3261   }
3262   PetscFunctionReturn(0);
3263 }
3264 
3265 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
3266 {
3267   PetscErrorCode ierr;
3268   Mat_SeqAIJ     *y = (Mat_SeqAIJ*)Y->data;
3269   PetscScalar    *ay;
3270   cudaError_t    cerr;
3271   cublasHandle_t cublasv2handle;
3272   cublasStatus_t berr;
3273   PetscBLASInt   one = 1, bnz = 1;
3274 
3275   PetscFunctionBegin;
3276   ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
3277   ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
3278   ierr = PetscBLASIntCast(y->nz,&bnz);CHKERRQ(ierr);
3279   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3280   berr = cublasXscal(cublasv2handle,bnz,&a,ay,one);CHKERRCUBLAS(berr);
3281   cerr = WaitForCUDA();CHKERRCUDA(cerr);
3282   ierr = PetscLogGpuFlops(bnz);CHKERRQ(ierr);
3283   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
3284   ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
3285   ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
3286   PetscFunctionReturn(0);
3287 }
3288 
3289 static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
3290 {
3291   PetscErrorCode             ierr;
3292   PetscBool                  both = PETSC_FALSE;
3293   Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;
3294 
3295   PetscFunctionBegin;
3296   if (A->factortype == MAT_FACTOR_NONE) {
3297     Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
3298     if (spptr->mat) {
3299       CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
3300       if (matrix->values) {
3301         both = PETSC_TRUE;
3302         thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
3303       }
3304     }
3305     if (spptr->matTranspose) {
3306       CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
3307       if (matrix->values) {
3308         thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
3309       }
3310     }
3311   }
3312   //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);
3313   ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr);
3314   ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
3315   if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
3316   else A->offloadmask = PETSC_OFFLOAD_CPU;
3317 
3318   PetscFunctionReturn(0);
3319 }
3320 
3321 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
3322 {
3323   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
3324   PetscErrorCode ierr;
3325 
3326   PetscFunctionBegin;
3327   if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
3328   if (flg) {
3329     ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
3330 
3331     A->ops->scale                     = MatScale_SeqAIJ;
3332     A->ops->axpy                      = MatAXPY_SeqAIJ;
3333     A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
3334     A->ops->mult                      = MatMult_SeqAIJ;
3335     A->ops->multadd                   = MatMultAdd_SeqAIJ;
3336     A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
3337     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
3338     A->ops->multhermitiantranspose    = NULL;
3339     A->ops->multhermitiantransposeadd = NULL;
3340     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
3341     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
3342     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
3343     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
3344     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
3345     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
3346     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
3347     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
3348   } else {
3349     A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
3350     A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
3351     A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
3352     A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
3353     A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
3354     A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
3355     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
3356     A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
3357     A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
3358     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
3359     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
3360     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
3361     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
3362     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
3363     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
3364     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
3365     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
3366   }
3367   A->boundtocpu = flg;
3368   a->inode.use = flg;
3369   PetscFunctionReturn(0);
3370 }
3371 
3372 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
3373 {
3374   PetscErrorCode   ierr;
3375   cusparseStatus_t stat;
3376   Mat              B;
3377 
3378   PetscFunctionBegin;
3379   ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */
3380   if (reuse == MAT_INITIAL_MATRIX) {
3381     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
3382   } else if (reuse == MAT_REUSE_MATRIX) {
3383     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
3384   }
3385   B = *newmat;
3386 
3387   ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
3388   ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);
3389 
3390   if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
3391     if (B->factortype == MAT_FACTOR_NONE) {
3392       Mat_SeqAIJCUSPARSE *spptr;
3393       ierr = PetscNew(&spptr);CHKERRQ(ierr);
3394       stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
3395       stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
3396       spptr->format     = MAT_CUSPARSE_CSR;
3397      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3398       spptr->spmvAlg    = CUSPARSE_CSRMV_ALG1;    /* default, since we only support csr */
3399       spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
3400       spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
3401      #endif
3402       B->spptr = spptr;
3403     } else {
3404       Mat_SeqAIJCUSPARSETriFactors *spptr;
3405 
3406       ierr = PetscNew(&spptr);CHKERRQ(ierr);
3407       stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
3408       stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
3409       B->spptr = spptr;
3410     }
3411     B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
3412   }
3413   B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
3414   B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
3415   B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
3416   B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
3417   B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
3418   B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;
3419 
3420   ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
3421   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
3422   ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
3423   PetscFunctionReturn(0);
3424 }
3425 
3426 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
3427 {
3428   PetscErrorCode ierr;
3429 
3430   PetscFunctionBegin;
3431   ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
3432   ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
3433   PetscFunctionReturn(0);
3434 }
3435 
3436 /*MC
3437    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3438 
3439    A matrix type whose data resides on Nvidia GPUs. These matrices can be stored in CSR, ELL, or Hybrid (HYB) format.
3440    The ELL and HYB formats require CUDA 4.2 or later and are not supported with CUDA 11.0 and later.
3441    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3442 
3443    Options Database Keys:
3444 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
3445 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3446 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3447 
3448   Level: beginner
3449 
3450 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3451 M*/
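
/*
   A sketch of typical usage (illustrative only; assumes the application calls MatSetFromOptions() on the matrix):

     ./app -mat_type aijcusparse -vec_type cuda
     ./app -mat_type aijcusparse -mat_cusparse_storage_format csr

   or, in code,

     ierr = MatSetType(A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
     ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,MAT_CUSPARSE_CSR);CHKERRQ(ierr);
*/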
3452 
3453 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);
3454 
3455 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
3456 {
3457   PetscErrorCode ierr;
3458 
3459   PetscFunctionBegin;
3460   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr);
3461   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
3462   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
3463   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
3464   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
3465 
3466   PetscFunctionReturn(0);
3467 }
3468 
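/*
   The factorizations registered above are normally selected at runtime, e.g. (a sketch; the solver
   strings correspond to MATSOLVERCUSPARSE and MATSOLVERCUSPARSEBAND registered above):

     ./app -mat_type aijcusparse -pc_type ilu -pc_factor_mat_solver_type cusparse
     ./app -mat_type aij         -pc_type lu  -pc_factor_mat_solver_type cusparseband

   MATSOLVERCUSPARSEBAND is registered only for MAT_FACTOR_LU on MATSEQAIJ matrices.
*/
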
3469 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
3470 {
3471   PetscErrorCode   ierr;
3472   cusparseStatus_t stat;
3473 
3474   PetscFunctionBegin;
3475   if (*cusparsestruct) {
3476     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
3477     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
3478     delete (*cusparsestruct)->workVector;
3479     delete (*cusparsestruct)->rowoffsets_gpu;
3480     delete (*cusparsestruct)->cooPerm;
3481     delete (*cusparsestruct)->cooPerm_a;
3482     delete (*cusparsestruct)->csr2csc_i;
3483     if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
3484     ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
3485   }
3486   PetscFunctionReturn(0);
3487 }
3488 
3489 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
3490 {
3491   PetscFunctionBegin;
3492   if (*mat) {
3493     delete (*mat)->values;
3494     delete (*mat)->column_indices;
3495     delete (*mat)->row_offsets;
3496     delete *mat;
3497     *mat = 0;
3498   }
3499   PetscFunctionReturn(0);
3500 }
3501 
3502 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
3503 {
3504   cusparseStatus_t stat;
3505   PetscErrorCode   ierr;
3506 
3507   PetscFunctionBegin;
3508   if (*trifactor) {
3509     if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
3510     if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
3511     ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
3512     if ((*trifactor)->solveBuffer)   {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
3513     if ((*trifactor)->AA_h)   {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);}
3514    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3515     if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
3516    #endif
3517     ierr = PetscFree(*trifactor);CHKERRQ(ierr);
3518   }
3519   PetscFunctionReturn(0);
3520 }
3521 
3522 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
3523 {
3524   CsrMatrix        *mat;
3525   cusparseStatus_t stat;
3526   cudaError_t      err;
3527 
3528   PetscFunctionBegin;
3529   if (*matstruct) {
3530     if ((*matstruct)->mat) {
3531       if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
3532        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3533         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3534        #else
3535         cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
3536         stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
3537        #endif
3538       } else {
3539         mat = (CsrMatrix*)(*matstruct)->mat;
3540         CsrMatrix_Destroy(&mat);
3541       }
3542     }
3543     if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
3544     delete (*matstruct)->cprowIndices;
3545     if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
3546     if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
3547     if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
3548 
3549    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3550     Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
3551     if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
3552     for (int i=0; i<3; i++) {
3553       if (mdata->cuSpMV[i].initialized) {
3554         err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
3555         stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
3556         stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
3557       }
3558     }
3559    #endif
3560     delete *matstruct;
3561     *matstruct = NULL;
3562   }
3563   PetscFunctionReturn(0);
3564 }
3565 
3566 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
3567 {
3568   PetscErrorCode ierr;
3569 
3570   PetscFunctionBegin;
3571   if (*trifactors) {
3572     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
3573     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
3574     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
3575     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
3576     delete (*trifactors)->rpermIndices;
3577     delete (*trifactors)->cpermIndices;
3578     delete (*trifactors)->workVector;
3579     (*trifactors)->rpermIndices = NULL;
3580     (*trifactors)->cpermIndices = NULL;
3581     (*trifactors)->workVector = NULL;
3582     if ((*trifactors)->a_band_d)   {cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);}
3583     if ((*trifactors)->i_band_d)   {cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);}
3584   }
3585   PetscFunctionReturn(0);
3586 }
3587 
3588 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
3589 {
3590   PetscErrorCode   ierr;
3591   cusparseHandle_t handle;
3592   cusparseStatus_t stat;
3593 
3594   PetscFunctionBegin;
3595   if (*trifactors) {
3596     ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
3597     if ((handle = (*trifactors)->handle)) {
3598       stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
3599     }
3600     ierr = PetscFree(*trifactors);CHKERRQ(ierr);
3601   }
3602   PetscFunctionReturn(0);
3603 }
3604 
3605 struct IJCompare
3606 {
3607   __host__ __device__
3608   inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
3609   {
3610     if (t1.get<0>() < t2.get<0>()) return true;
3611     if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
3612     return false;
3613   }
3614 };
3615 
3616 struct IJEqual
3617 {
3618   __host__ __device__
3619   inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
3620   {
3621     if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false;
3622     return true;
3623   }
3624 };
3625 
3626 struct IJDiff
3627 {
3628   __host__ __device__
3629   inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
3630   {
3631     return t1 == t2 ? 0 : 1;
3632   }
3633 };
3634 
3635 struct IJSum
3636 {
3637   __host__ __device__
3638   inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
3639   {
3640     return t1||t2;
3641   }
3642 };
3643 
3644 #include <thrust/iterator/discard_iterator.h>
3645 PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
3646 {
3647   Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3648   Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
3649   THRUSTARRAY                           *cooPerm_v = NULL;
3650   thrust::device_ptr<const PetscScalar> d_v;
3651   CsrMatrix                             *matrix;
3652   PetscErrorCode                        ierr;
3653   cudaError_t                           cerr;
3654   PetscInt                              n;
3655 
3656   PetscFunctionBegin;
3657   if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
3658   if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
3659   if (!cusp->cooPerm) {
3660     ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
3661     ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
3662     PetscFunctionReturn(0);
3663   }
3664   matrix = (CsrMatrix*)cusp->mat->mat;
3665   if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
3666   if (!v) {
3667     if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
3668     goto finalize;
3669   }
3670   n = cusp->cooPerm->size();
3671   if (isCudaMem(v)) {
3672     d_v = thrust::device_pointer_cast(v);
3673   } else {
3674     cooPerm_v = new THRUSTARRAY(n);
3675     cooPerm_v->assign(v,v+n);
3676     d_v = cooPerm_v->data();
3677     ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
3678   }
3679   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
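  /* cooPerm[k] holds the position in the user array v of the k-th nonzero in sorted (CSR) order, so a
     permutation iterator over (v,cooPerm) presents the user values in CSR order. cooPerm_a, built in
     MatSetPreallocationCOO_SeqAIJCUSPARSE(), is only present when the COO input contained repeated (i,j)
     entries; cooPerm_a[k] is then the destination nonzero of sorted entry k, and reduce_by_key over it
     sums the duplicates. */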
3680   if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
3681     if (cusp->cooPerm_a) {
3682       THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
3683       auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
3684       thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
3685       thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
3686       delete cooPerm_w;
3687     } else {
3688       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
3689                                                                 matrix->values->begin()));
3690       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
3691                                                                 matrix->values->end()));
3692       thrust::for_each(zibit,zieit,VecCUDAPlusEquals());
3693     }
3694   } else {
3695     if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
3696       auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
3697       thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
3698     } else {
3699       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
3700                                                                 matrix->values->begin()));
3701       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
3702                                                                 matrix->values->end()));
3703       thrust::for_each(zibit,zieit,VecCUDAEquals());
3704     }
3705   }
3706   cerr = WaitForCUDA();CHKERRCUDA(cerr);
3707   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
3708 finalize:
3709   delete cooPerm_v;
3710   A->offloadmask = PETSC_OFFLOAD_GPU;
3711   ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
3712   /* shorter version of MatAssemblyEnd_SeqAIJ */
3713   ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr);
3714   ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
3715   ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr);
3716   a->reallocs         = 0;
3717   A->info.mallocs    += 0;
3718   A->info.nz_unneeded = 0;
3719   A->assembled = A->was_assembled = PETSC_TRUE;
3720   A->num_ass++;
3721   PetscFunctionReturn(0);
3722 }
3723 
3724 PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
3725 {
3726   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3727   PetscErrorCode     ierr;
3728 
3729   PetscFunctionBegin;
3730   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
3731   if (!cusp) PetscFunctionReturn(0);
3732   if (destroy) {
3733     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
3734     delete cusp->csr2csc_i;
3735     cusp->csr2csc_i = NULL;
3736   }
3737   A->transupdated = PETSC_FALSE;
3738   PetscFunctionReturn(0);
3739 }
3740 
3741 #include <thrust/binary_search.h>
3742 PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
3743 {
3744   PetscErrorCode     ierr;
3745   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3746   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
3747   PetscInt           cooPerm_n, nzr = 0;
3748   cudaError_t        cerr;
3749 
3750   PetscFunctionBegin;
3751   ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
3752   ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
3753   cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
3754   if (n != cooPerm_n) {
3755     delete cusp->cooPerm;
3756     delete cusp->cooPerm_a;
3757     cusp->cooPerm = NULL;
3758     cusp->cooPerm_a = NULL;
3759   }
3760   if (n) {
3761     THRUSTINTARRAY d_i(n);
3762     THRUSTINTARRAY d_j(n);
3763     THRUSTINTARRAY ii(A->rmap->n);
3764 
3765     if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
3766     if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }
3767 
3768     ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
3769     d_i.assign(coo_i,coo_i+n);
3770     d_j.assign(coo_j,coo_j+n);
3771     auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
3772     auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));
3773 
3774     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3775     thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
3776     thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare());
3777     *cusp->cooPerm_a = d_i;
3778     THRUSTINTARRAY w = d_j;
3779 
3780     auto nekey = thrust::unique(fkey, ekey, IJEqual());
3781     if (nekey == ekey) { /* all entries are unique */
3782       delete cusp->cooPerm_a;
3783       cusp->cooPerm_a = NULL;
3784     } else { /* I couldn't come up with a more elegant algorithm */
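      /* Build cooPerm_a[k] = index of the unique (compressed) nonzero that sorted entry k maps to:
         adjacent_difference with IJDiff flags a 1 wherever the row index (resp. the column index, in w)
         differs from the previous sorted entry, IJSum ORs the two flags, and the inclusive scan turns the
         flags into a running count. E.g. for sorted (i,j) = (0,1),(0,1),(0,3),(1,2),(1,2) the combined
         flags are 0,0,1,1,0 and the scan gives 0,0,1,2,2, i.e. the five entries map to the three unique
         nonzeros 0, 1 and 2 as 0,0,1,2,2. */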
3785       adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff());
3786       adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());
3787       (*cusp->cooPerm_a)[0] = 0;
3788       w[0] = 0;
3789       thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum());
3790       thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>());
3791     }
3792     thrust::counting_iterator<PetscInt> search_begin(0);
3793     thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(),
3794                         search_begin, search_begin + A->rmap->n,
3795                         ii.begin());
3796     cerr = WaitForCUDA();CHKERRCUDA(cerr);
3797     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
3798 
3799     ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
3800     a->singlemalloc = PETSC_FALSE;
3801     a->free_a       = PETSC_TRUE;
3802     a->free_ij      = PETSC_TRUE;
3803     ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
3804     a->i[0] = 0;
3805     cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
3806     a->nz = a->maxnz = a->i[A->rmap->n];
3807     a->rmax = 0;
3808     ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
3809     ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
3810     cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
3811     if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
3812     if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
3813     for (PetscInt i = 0; i < A->rmap->n; i++) {
3814       const PetscInt nnzr = a->i[i+1] - a->i[i];
3815       nzr += (PetscInt)!!(nnzr);
3816       a->ilen[i] = a->imax[i] = nnzr;
3817       a->rmax = PetscMax(a->rmax,nnzr);
3818     }
3819     a->nonzerorowcnt = nzr;
3820     A->preallocated = PETSC_TRUE;
3821     ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
3822     ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
3823   } else {
3824     ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
3825   }
3826   ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);
3827 
3828   /* We want to allocate the CUSPARSE struct for matvec now.
3829      The code is so convoluted now that I prefer to copy zeros */
3830   ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
3831   ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
3832   A->offloadmask = PETSC_OFFLOAD_CPU;
3833   A->nonzerostate++;
3834   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
3835   ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
3836 
3837   A->assembled = PETSC_FALSE;
3838   A->was_assembled = PETSC_FALSE;
3839   PetscFunctionReturn(0);
3840 }
3841 
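/*
   A usage sketch for the COO assembly path implemented above (illustrative only; the index arrays and
   their size are hypothetical). Repeated (i,j) pairs are allowed and their values are summed:

     PetscInt    coo_i[] = {0,0,1,1,1};
     PetscInt    coo_j[] = {0,1,0,1,1};
     PetscScalar v[]     = {1.,2.,3.,4.,5.};                       // last two entries both target (1,1)
     ierr = MatSetPreallocationCOO(A,5,coo_i,coo_j);CHKERRQ(ierr);
     ierr = MatSetValuesCOO(A,v,INSERT_VALUES);CHKERRQ(ierr);      // or ADD_VALUES to accumulate into existing values
*/
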
3842 PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
3843 {
3844   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3845   CsrMatrix          *csr;
3846   PetscErrorCode     ierr;
3847 
3848   PetscFunctionBegin;
3849   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
3850   PetscValidPointer(a,2);
3851   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
3852   if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3853   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
3854   if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3855   csr = (CsrMatrix*)cusp->mat->mat;
3856   if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
3857   *a = csr->values->data().get();
3858   PetscFunctionReturn(0);
3859 }
3860 
3861 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
3862 {
3863   PetscFunctionBegin;
3864   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
3865   PetscValidPointer(a,2);
3866   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
3867   *a = NULL;
3868   PetscFunctionReturn(0);
3869 }
3870 
3871 PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
3872 {
3873   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3874   CsrMatrix          *csr;
3875   PetscErrorCode     ierr;
3876 
3877   PetscFunctionBegin;
3878   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
3879   PetscValidPointer(a,2);
3880   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
3881   if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3882   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
3883   if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3884   csr = (CsrMatrix*)cusp->mat->mat;
3885   if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
3886   *a = csr->values->data().get();
3887   A->offloadmask = PETSC_OFFLOAD_GPU;
3888   ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
3889   PetscFunctionReturn(0);
3890 }
3891 
3892 PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
3893 {
3894   PetscErrorCode ierr;
3895 
3896   PetscFunctionBegin;
3897   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
3898   PetscValidPointer(a,2);
3899   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
3900   ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
3901   *a = NULL;
3902   PetscFunctionReturn(0);
3903 }
3904 
3905 PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
3906 {
3907   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3908   CsrMatrix          *csr;
3909   PetscErrorCode     ierr;
3910 
3911   PetscFunctionBegin;
3912   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
3913   PetscValidPointer(a,2);
3914   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
3915   if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3916   if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3917   csr = (CsrMatrix*)cusp->mat->mat;
3918   if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
3919   *a = csr->values->data().get();
3920   A->offloadmask = PETSC_OFFLOAD_GPU;
3921   ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
3922   PetscFunctionReturn(0);
3923 }
3924 
3925 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
3926 {
3927   PetscErrorCode ierr;
3928 
3929   PetscFunctionBegin;
3930   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
3931   PetscValidPointer(a,2);
3932   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
3933   ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
3934   *a = NULL;
3935   PetscFunctionReturn(0);
3936 }
3937 
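/*
   A sketch of the device-array access pattern provided by the routines above (illustrative only):
   MatSeqAIJCUSPARSEGetArrayRead() returns the device pointer to the CSR values for read access, while
   MatSeqAIJCUSPARSEGetArray()/MatSeqAIJCUSPARSEGetArrayWrite() mark the GPU copy as authoritative and
   invalidate the cached transpose. For example, scaling the stored values with cuBLAS, mirroring
   MatScale_SeqAIJCUSPARSE() above (cublasv2handle, bnz and alpha are assumed to be set up as there):

     PetscScalar *val;
     ierr = MatSeqAIJCUSPARSEGetArray(A,&val);CHKERRQ(ierr);
     berr = cublasXscal(cublasv2handle,bnz,&alpha,val,1);CHKERRCUBLAS(berr);
     ierr = MatSeqAIJCUSPARSERestoreArray(A,&val);CHKERRQ(ierr);
*/
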
3938 struct IJCompare4
3939 {
3940   __host__ __device__
3941   inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
3942   {
3943     if (t1.get<0>() < t2.get<0>()) return true;
3944     if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
3945     return false;
3946   }
3947 };
3948 
3949 struct Shift
3950 {
3951   int _shift;
3952 
3953   Shift(int shift) : _shift(shift) {}
3954   __host__ __device__
3955   inline int operator() (const int &c)
3956   {
3957     return c + _shift;
3958   }
3959 };
3960 
3961 /* merges two SeqAIJCUSPARSE matrices side by side into C = [A,B], i.e. the [A';B']' operation in MATLAB notation */
3962 PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
3963 {
3964   PetscErrorCode               ierr;
3965   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
3966   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
3967   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
3968   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
3969   PetscInt                     Annz,Bnnz;
3970   cusparseStatus_t             stat;
3971   PetscInt                     i,m,n,zero = 0;
3972   cudaError_t                  cerr;
3973 
3974   PetscFunctionBegin;
3975   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
3976   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
3977   PetscValidPointer(C,4);
3978   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
3979   PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
3980   if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %D != %D",A->rmap->n,B->rmap->n);
3981   if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
3982   if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3983   if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3984   if (reuse == MAT_INITIAL_MATRIX) {
3985     m     = A->rmap->n;
3986     n     = A->cmap->n + B->cmap->n;
3987     ierr  = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
3988     ierr  = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
3989     ierr  = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
3990     c     = (Mat_SeqAIJ*)(*C)->data;
3991     Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
3992     Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
3993     Ccsr  = new CsrMatrix;
3994     Cmat->cprowIndices      = NULL;
3995     c->compressedrow.use    = PETSC_FALSE;
3996     c->compressedrow.nrows  = 0;
3997     c->compressedrow.i      = NULL;
3998     c->compressedrow.rindex = NULL;
3999     Ccusp->workVector       = NULL;
4000     Ccusp->nrows    = m;
4001     Ccusp->mat      = Cmat;
4002     Ccusp->mat->mat = Ccsr;
4003     Ccsr->num_rows  = m;
4004     Ccsr->num_cols  = n;
4005     stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
4006     stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4007     stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
4008     cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
4009     cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
4010     cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
4011     cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4012     cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4013     cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4014     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
4015     ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
4016     ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
4017     ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr);
4018     if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4019     if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4020 
4021     Acsr = (CsrMatrix*)Acusp->mat->mat;
4022     Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4023     Annz = (PetscInt)Acsr->column_indices->size();
4024     Bnnz = (PetscInt)Bcsr->column_indices->size();
4025     c->nz = Annz + Bnnz;
4026     Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
4027     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4028     Ccsr->values = new THRUSTARRAY(c->nz);
4029     Ccsr->num_entries = c->nz;
4030     Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
4031     if (c->nz) {
4032       auto Acoo = new THRUSTINTARRAY32(Annz);
4033       auto Bcoo = new THRUSTINTARRAY32(Bnnz);
4034       auto Ccoo = new THRUSTINTARRAY32(c->nz);
4035       THRUSTINTARRAY32 *Aroff,*Broff;
4036 
4037       if (a->compressedrow.use) { /* need full row offset */
4038         if (!Acusp->rowoffsets_gpu) {
4039           Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
4040           Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
4041           ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
4042         }
4043         Aroff = Acusp->rowoffsets_gpu;
4044       } else Aroff = Acsr->row_offsets;
4045       if (b->compressedrow.use) { /* need full row offset */
4046         if (!Bcusp->rowoffsets_gpu) {
4047           Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
4048           Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
4049           ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
4050         }
4051         Broff = Bcusp->rowoffsets_gpu;
4052       } else Broff = Bcsr->row_offsets;
4053       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4054       stat = cusparseXcsr2coo(Acusp->handle,
4055                               Aroff->data().get(),
4056                               Annz,
4057                               m,
4058                               Acoo->data().get(),
4059                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4060       stat = cusparseXcsr2coo(Bcusp->handle,
4061                               Broff->data().get(),
4062                               Bnnz,
4063                               m,
4064                               Bcoo->data().get(),
4065                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4066       /* there are issues using bool with large matrices on SUMMIT (CUDA 10.2.89), so integer 0/1 markers record whether an entry comes from A or B */
4067       auto Aperm = thrust::make_constant_iterator(1);
4068       auto Bperm = thrust::make_constant_iterator(0);
4069 #if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
4070       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
4071       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
4072 #else
4073       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
4074       auto Bcib = Bcsr->column_indices->begin();
4075       auto Bcie = Bcsr->column_indices->end();
4076       thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
4077 #endif
4078       auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
4079       auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
4080       auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
4081       auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
4082       auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
4083       auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
4084       auto p1 = Ccusp->cooPerm->begin();
4085       auto p2 = Ccusp->cooPerm->begin();
4086       thrust::advance(p2,Annz);
4087       PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
4088 #if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
4089       thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
4090 #endif
4091       auto cci = thrust::make_counting_iterator(zero);
4092       auto cce = thrust::make_counting_iterator(c->nz);
4093 #if 0 //Errors on SUMMIT cuda 11.1.0
4094       PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
4095 #else
4096       auto pred = thrust::identity<int>();
4097       PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
4098       PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
4099 #endif
4100       stat = cusparseXcoo2csr(Ccusp->handle,
4101                               Ccoo->data().get(),
4102                               c->nz,
4103                               m,
4104                               Ccsr->row_offsets->data().get(),
4105                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4106       cerr = WaitForCUDA();CHKERRCUDA(cerr);
4107       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4108       delete wPerm;
4109       delete Acoo;
4110       delete Bcoo;
4111       delete Ccoo;
4112 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4113       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
4114                                Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
4115                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4116                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4117 #endif
4118       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
4119         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4120         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4121         CsrMatrix *CcsrT = new CsrMatrix;
4122         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4123         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4124 
4125         (*C)->form_explicit_transpose = PETSC_TRUE;
4126         (*C)->transupdated = PETSC_TRUE;
4127         Ccusp->rowoffsets_gpu = NULL;
4128         CmatT->cprowIndices = NULL;
4129         CmatT->mat = CcsrT;
4130         CcsrT->num_rows = n;
4131         CcsrT->num_cols = m;
4132         CcsrT->num_entries = c->nz;
4133 
4134         CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
4135         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4136         CcsrT->values = new THRUSTARRAY(c->nz);
4137 
4138         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4139         auto rT = CcsrT->row_offsets->begin();
4140         if (AT) {
4141           rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
4142           thrust::advance(rT,-1);
4143         }
4144         if (BT) {
4145           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
4146           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
4147           thrust::copy(titb,tite,rT);
4148         }
4149         auto cT = CcsrT->column_indices->begin();
4150         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
4151         if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
4152         auto vT = CcsrT->values->begin();
4153         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4154         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4155         cerr = WaitForCUDA();CHKERRCUDA(cerr);
4156         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4157 
4158         stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
4159         stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4160         stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
4161         cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
4162         cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
4163         cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
4164         cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4165         cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4166         cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4167 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4168         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
4169                                  CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
4170                                  CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4171                                  CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4172 #endif
4173         Ccusp->matTranspose = CmatT;
4174       }
4175     }
4176 
4177     c->singlemalloc = PETSC_FALSE;
4178     c->free_a       = PETSC_TRUE;
4179     c->free_ij      = PETSC_TRUE;
4180     ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
4181     ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
4182     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
4183       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4184       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4185       ii   = *Ccsr->row_offsets;
4186       jj   = *Ccsr->column_indices;
4187       cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4188       cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4189     } else {
4190       cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4191       cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4192     }
4193     ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
4194     ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
4195     ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
4196     c->maxnz = c->nz;
4197     c->nonzerorowcnt = 0;
4198     c->rmax = 0;
4199     for (i = 0; i < m; i++) {
4200       const PetscInt nn = c->i[i+1] - c->i[i];
4201       c->ilen[i] = c->imax[i] = nn;
4202       c->nonzerorowcnt += (PetscInt)!!nn;
4203       c->rmax = PetscMax(c->rmax,nn);
4204     }
4205     ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
4206     ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
4207     (*C)->nonzerostate++;
4208     ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
4209     ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
4210     Ccusp->nonzerostate = (*C)->nonzerostate;
4211     (*C)->preallocated  = PETSC_TRUE;
4212   } else {
4213     if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %D != %D",(*C)->rmap->n,B->rmap->n);
4214     c = (Mat_SeqAIJ*)(*C)->data;
4215     if (c->nz) {
4216       Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
4217       if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
4218       if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4219       if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
4220       ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
4221       ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
4222       if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4223       if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4224       Acsr = (CsrMatrix*)Acusp->mat->mat;
4225       Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4226       Ccsr = (CsrMatrix*)Ccusp->mat->mat;
4227       if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
4228       if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
4229       if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
4230       if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
4231       if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
4232       auto pmid = Ccusp->cooPerm->begin();
4233       thrust::advance(pmid,Acsr->num_entries);
4234       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
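           // scatter the current values of A and B into C through the cooPerm permutation computed at
           // symbolic time: the first Acsr->num_entries entries of cooPerm address A's slots in C, the rest B's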
4235       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
4236                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
4237       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
4238                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4239       thrust::for_each(zibait,zieait,VecCUDAEquals());
4240       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
4241                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4242       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
4243                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
4244       thrust::for_each(zibbit,ziebit,VecCUDAEquals());
4245       ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr);
4246       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
4247         if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4248         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4249         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4250         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4251         CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
4252         auto vT = CcsrT->values->begin();
4253         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4254         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4255         (*C)->transupdated = PETSC_TRUE;
4256       }
4257       cerr = WaitForCUDA();CHKERRCUDA(cerr);
4258       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4259     }
4260   }
4261   ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
4262   (*C)->assembled     = PETSC_TRUE;
4263   (*C)->was_assembled = PETSC_FALSE;
4264   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
4265   PetscFunctionReturn(0);
4266 }
4267 
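     /* Gather selected matrix values on the GPU: v[k] = a[idx[k]] for k = 0,...,n-1.
        When v points to host memory the gather is staged through a temporary device buffer and
        copied back; when idx is NULL the first n values are copied directly. */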
4268 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
4269 {
4270   PetscErrorCode    ierr;
4271   bool              dmem;
4272   const PetscScalar *av;
4273   cudaError_t       cerr;
4274 
4275   PetscFunctionBegin;
4276   dmem = isCudaMem(v);
4277   ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
4278   if (n && idx) {
4279     THRUSTINTARRAY widx(n);
4280     widx.assign(idx,idx+n);
4281     ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
4282 
4283     THRUSTARRAY *w = NULL;
4284     thrust::device_ptr<PetscScalar> dv;
4285     if (dmem) {
4286       dv = thrust::device_pointer_cast(v);
4287     } else {
4288       w = new THRUSTARRAY(n);
4289       dv = w->data();
4290     }
4291     thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);
4292 
4293     auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
4294     auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
4295     thrust::for_each(zibit,zieit,VecCUDAEquals());
4296     if (w) {
4297       cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4298     }
4299     delete w;
4300   } else {
4301     cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4302   }
4303   if (!dmem) { ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr); }
4304   ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
4305   PetscFunctionReturn(0);
4306 }
4307 
4308 /*
4309   LU BAND factorization with optimization for block diagonal (Nf blocks) in natural order (-mat_no_inode -pc_factor_mat_ordering_type rcm with Nf>1 fields)
4310 
4311   requires:
4312      structurally symmetric matrices (this restriction could be removed with transpose/column metadata)
4313 */
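     /*
       Band storage used below: row r stores columns max(0,r-bw) .. min(n-1,r+bw) contiguously,
       bi_csr[r] is the offset of the first entry of row r, and the diagonal sits at
       bi_csr[r] + min(r,bw).  The corner "ears" are not stored, so nz = n + (2n-1)*bw - bw^2.
       For example, with n=4 and bw=1:

          row 0: [d u]   row 1: [l d u]   row 2: [l d u]   row 3: [l d]

       giving nz = 4 + 7*1 - 1 = 10 = 2+3+3+2 stored entries.
     */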
4314 
4315 /*
4316   The GPU LU factor kernel
4317 */
4318 __global__
4319 void __launch_bounds__(1024,1)
4320 mat_lu_factor_band_init_set_i(const PetscInt n, const int bw, int bi_csr[])
4321 {
4322   const PetscInt  Nf = gridDim.x, Nblk = gridDim.y, nloc = n/Nf;
4323   const PetscInt  field = blockIdx.x, blkIdx = blockIdx.y;
4324   const PetscInt  nloc_i =  (nloc/Nblk + !!(nloc%Nblk)), start_i = field*nloc + blkIdx*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
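       // launch decomposition: one field per blockIdx.x and Nblk thread blocks per field along blockIdx.y;
       // each block handles nloc_i = ceil(nloc/Nblk) consecutive rows of its field, strided by threadIdx.y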
4325 
4326   // set i (row+1)
4327   if (threadIdx.x + threadIdx.y + blockIdx.x + blockIdx.y == 0) bi_csr[0] = 0; // dummy at zero
4328   // for (int rowb = start_i + blkIdx*blockDim.y + threadIdx.y; rowb < end_i; rowb += Nblk*blockDim.y) { // rows in block
4329   for (int rowb = start_i + threadIdx.y; rowb < end_i; rowb += blockDim.y) { // rows in block by thread y
4330     if (rowb < end_i && threadIdx.x==0) {
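           // bi_csr[rowb+1] = number of band entries in rows 0..rowb:
           //   n1L = L entries of the first min(i,bw+1) rows (0+1+...+(ni-1)), n2L = bw for each later row,
           //   nug = i*bw potential U entries, clip = U entries lost to the bottom-right ear, plus i diagonals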
4331       PetscInt i=rowb+1, ni = (rowb>bw) ? bw+1 : i, n1L = ni*(ni-1)/2, nug= i*bw, n2L = bw*((rowb>bw) ? (rowb-bw) : 0), mi = bw + rowb + 1 - n, clip = (mi>0) ? mi*(mi-1)/2 + mi: 0;
4332       bi_csr[rowb+1] = n1L + nug - clip + n2L + i;
4333     }
4334   }
4335 }
4336 // copy AIJ to AIJ_BAND
4337 __global__
4338 void __launch_bounds__(1024,1)
4339 mat_lu_factor_band_copy_aij_aij(const PetscInt n, const int bw, const PetscInt r[], const PetscInt ic[],
4340                                 const int ai_d[], const int aj_d[], const PetscScalar aa_d[],
4341                                 const int bi_csr[], PetscScalar ba_csr[])
4342 {
4343   const PetscInt  Nf = gridDim.x, Nblk = gridDim.y, nloc = n/Nf;
4344   const PetscInt  field = blockIdx.x, blkIdx = blockIdx.y;
4345   const PetscInt  nloc_i =  (nloc/Nblk + !!(nloc%Nblk)), start_i = field*nloc + blkIdx*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
4346 
4347   // zero B
4348   if (threadIdx.x + threadIdx.y + blockIdx.x + blockIdx.y == 0) ba_csr[bi_csr[n]] = 0; // flop count at end
4349   for (int rowb = start_i + threadIdx.y; rowb < end_i; rowb += blockDim.y) { // rows in block by thread y
4350     if (rowb < end_i) {
4351       PetscScalar    *batmp = ba_csr + bi_csr[rowb];
4352       const PetscInt nzb = bi_csr[rowb+1] - bi_csr[rowb];
4353       for (int j=threadIdx.x ; j<nzb ; j += blockDim.x) {
4354         if (j<nzb) {
4355           batmp[j] = 0;
4356         }
4357       }
4358     }
4359   }
4360 
4361   // copy A into B with CSR format -- these two loops can be fused
4362   for (int rowb = start_i + threadIdx.y; rowb < end_i; rowb += blockDim.y) { // rows in block by thread y
4363     if (rowb < end_i) {
4364       const PetscInt    rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
4365       const int         *ajtmp = aj_d + ai_d[rowa], bjStart = (rowb>bw) ? rowb-bw : 0;
4366       const PetscScalar *av    = aa_d + ai_d[rowa];
4367       PetscScalar       *batmp = ba_csr + bi_csr[rowb];
4368       /* load in initial (unfactored row) */
4369       for (int j=threadIdx.x ; j<nza ; j += blockDim.x) {
4370         if (j<nza) {
4371           PetscInt    colb = ic[ajtmp[j]], idx = colb - bjStart;
4372           PetscScalar vala = av[j];
4373           batmp[idx] = vala;
4374         }
4375       }
4376     }
4377   }
4378 }
4379 // print AIJ_BAND
4380 __global__
4381 void print_mat_aij_band(const PetscInt n, const int bi_csr[], const PetscScalar ba_csr[])
4382 {
4383   // debug
4384   if (threadIdx.x + threadIdx.y + blockIdx.x + blockIdx.y == 0){
4385     printf("B (AIJ) n=%d:\n",(int)n);
4386     for (int rowb=0;rowb<n;rowb++) {
4387       const PetscInt    nz = bi_csr[rowb+1] - bi_csr[rowb];
4388       const PetscScalar *batmp = ba_csr + bi_csr[rowb];
4389       for (int j=0; j<nz; j++) printf("(%13.6e) ",PetscRealPart(batmp[j]));
4390       printf(" bi=%d\n",bi_csr[rowb+1]);
4391     }
4392   }
4393 }
4394 // Band LU kernel ---  ba_csr bi_csr
4395 __global__
4396 void __launch_bounds__(1024,1)
4397 mat_lu_factor_band(const PetscInt n, const PetscInt bw, const int bi_csr[], PetscScalar ba_csr[])
4398 {
4399   extern __shared__ PetscInt smemInt[];
4400   PetscInt        *sm_pkIdx  = &smemInt[0];
4401   const PetscInt  Nf = gridDim.x, Nblk = gridDim.y, nloc = n/Nf;
4402   const PetscInt  field = blockIdx.x, blkIdx = blockIdx.y;
4403   const PetscInt  start = field*nloc, end = start + nloc;
4404 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4405   auto g = cooperative_groups::this_grid();
4406 #endif
4407   // A22 panel update for each row A(1,:) and col A(:,1)
4408   for (int glbDD=start, locDD = 0; glbDD<end; glbDD++, locDD++) {
4409     PetscInt          tnzUd = bw, maxU = end-1 - glbDD; // cap the update at the end of the field, chopping off the corner ears of each diagonal block
4410     const PetscInt    nzUd  = (tnzUd>maxU) ? maxU : tnzUd, dOffset = (glbDD > bw) ? bw : glbDD; // global to go past ears after first
4411     const PetscInt    nzUd_pad = blockDim.y*(nzUd/blockDim.y + !!(nzUd%blockDim.y));
4412     PetscScalar       *pBdd = ba_csr + bi_csr[glbDD] + dOffset;
4413     const PetscScalar *baUd = pBdd + 1; // vector of data  U(i,i+1:end)
4414     const PetscScalar Bdd = *pBdd;
4415     const PetscInt offset = blkIdx*blockDim.y + threadIdx.y, inc = Nblk*blockDim.y;
4416     for (int idx = offset, myi = glbDD + offset + 1; idx < nzUd_pad ; idx += inc, myi += inc) { /* assuming symmetric structure */
4417       if (idx < nzUd && threadIdx.x==0) { /* assuming symmetric structure */
4418         const PetscInt bwi = myi > bw ? bw : myi, kIdx = bwi - (myi-glbDD); // cuts off just the first (global) block
4419         PetscScalar    *Aid = ba_csr + bi_csr[myi] + kIdx;
4420         *Aid = *Aid/Bdd;
4421         sm_pkIdx[threadIdx.y] = kIdx;
4422       }
4423       __syncthreads(); // full block sync; only threads that share threadIdx.y actually need it
4424       if (idx < nzUd) { /* assuming symmetric structure */
4425         PetscInt    kIdx = sm_pkIdx[threadIdx.y];
4426         PetscScalar *Aid = ba_csr + bi_csr[myi] + kIdx;
4427         PetscScalar *Aij =  Aid + 1;
4428         PetscScalar Lid  = *Aid;
4429         for (int jIdx=threadIdx.x ; jIdx<nzUd ; jIdx += blockDim.x) {
4430           if (jIdx<nzUd) {
4431             Aij[jIdx] -= Lid*baUd[jIdx];
4432           }
4433         }
4434       }
4435     }
4436 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4437     g.sync();
4438 #else
4439     __syncthreads();
4440 #endif
4441   } /* end of for (glbDD=start; glbDD<end; glbDD++) */
4442 }
4443 
4444 static PetscErrorCode MatSolve_SeqAIJCUSPARSEBAND(Mat,Vec,Vec);
4445 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSEBAND(Mat B,Mat A,const MatFactorInfo *info)
4446 {
4447   Mat_SeqAIJ                   *b = (Mat_SeqAIJ*)B->data;
4448   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
4449   if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
4450   Mat_SeqAIJCUSPARSE           *cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr;
4451   Mat_SeqAIJCUSPARSEMultStruct *matstructA;
4452   CsrMatrix                    *matrixA;
4453   PetscErrorCode               ierr;
4454   cudaError_t                  cerr;
4455   const PetscInt               n=A->rmap->n, *ic, *r;
4456   const int                    *ai_d, *aj_d;
4457   const PetscScalar            *aa_d;
4458   PetscScalar                  *ba_t = cusparseTriFactors->a_band_d;
4459   int                          *bi_t = cusparseTriFactors->i_band_d;
4460   PetscContainer               container;
4461   int                          Ni = 10, team_size=9, Nf, nVec=56, nconcurrent = 1, nsm = -1;
4462 
4463   PetscFunctionBegin;
4464   if (A->rmap->n == 0) {
4465     PetscFunctionReturn(0);
4466   }
4467   // cusparse setup
4468   if (!cusparsestructA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparsestructA");
4469   matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat; //  matstruct->cprowIndices
4470   if (!matstructA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing mat struct");
4471   matrixA = (CsrMatrix*)matstructA->mat;
4472   if (!matrixA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing matrix cusparsestructA->mat->mat");
4473 
4474   // factor: get the number of fields Nf if attached; the composed value encodes 1000*nconcurrent + Nf
4475   ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr);
4476   if (container) {
4477     PetscInt *pNf=NULL;
4478     ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr);
4479     Nf = (*pNf)%1000;
4480     if ((*pNf)/1000>0) nconcurrent = (*pNf)/1000; // number of SMs to use
4481   } else Nf = 1;
4482   if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf);
4483 
4484   // get data
4485   ic      = thrust::raw_pointer_cast(cusparseTriFactors->cpermIndices->data());
4486   ai_d    = thrust::raw_pointer_cast(matrixA->row_offsets->data());
4487   aj_d    = thrust::raw_pointer_cast(matrixA->column_indices->data());
4488   aa_d    = thrust::raw_pointer_cast(matrixA->values->data().get());
4489   r       = thrust::raw_pointer_cast(cusparseTriFactors->rpermIndices->data());
4490 
4491   cerr = WaitForCUDA();CHKERRCUDA(cerr);
4492   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4493   {
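         // recover the bandwidth bw from the band nonzero count fixed at symbolic time:
         // nz = n + (2n-1)*bw - bw^2, hence bw = ((2n-1) - sqrt(1+4*(n^2-nz)))/2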
4494     int bw = (2*n-1 - (int)(PetscSqrtReal(1+4*(n*n-b->nz))+PETSC_MACHINE_EPSILON))/2, bm1=bw-1,nl=n/Nf;
4495     int gpuid;
4496     cudaDeviceProp prop;
4497     cerr = cudaGetDevice(&gpuid);CHKERRCUDA(cerr);
4498     cerr = cudaGetDeviceProperties(&prop, gpuid);CHKERRCUDA(cerr);
4499 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
4500     Ni = 1/nconcurrent;
4501     Ni = 1;
4502 #else
4503     nsm = prop.multiProcessorCount;
4504     Ni = nsm/Nf/nconcurrent;
4505 #endif
4506     team_size = bw/Ni + !!(bw%Ni);
4507     nVec = PetscMin(bw, 1024/team_size);
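         // launch geometry: blockDim = (nVec,team_size); the Ni*team_size thread rows cover the bw panel
         // rows below each pivot, while the nVec threads in x sweep across the entries of each row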
4508     ierr = PetscInfo5(A,"Matrix bandwidth = %d, number of SMs/block = %d, num concurrency = %d, num fields = %d, num SMs/GPU = %d\n",bw,Ni,nconcurrent,Nf,nsm);CHKERRQ(ierr);
4509     {
4510       dim3 dimBlockTeam(nVec,team_size);
4511       dim3 dimBlockLeague(Nf,Ni);
4512       mat_lu_factor_band_copy_aij_aij<<<dimBlockLeague,dimBlockTeam>>>(n, bw, r, ic, ai_d, aj_d, aa_d, bi_t, ba_t);
4513       CHECK_LAUNCH_ERROR(); // does a sync
4514 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4515       void *kernelArgs[] = { (void*)&n, (void*)&bw, (void*)&bi_t, (void*)&ba_t};
4516       cerr = cudaLaunchCooperativeKernel((void*)mat_lu_factor_band, dimBlockLeague, dimBlockTeam, kernelArgs, team_size*sizeof(PetscInt), NULL);CHKERRCUDA(cerr);
4517 #else
4518       mat_lu_factor_band<<<dimBlockLeague,dimBlockTeam,team_size*sizeof(PetscInt)>>>(n, bw, bi_t, ba_t);
4519 #endif
4520       CHECK_LAUNCH_ERROR(); // does a sync
4521 #if defined(PETSC_USE_LOG)
4522       ierr = PetscLogGpuFlops((PetscLogDouble)Nf*(bm1*(bm1 + 1)*(2*bm1 + 1)/3 + 2*(nl-bw)*bw*bw + nl*(nl+1)/2));CHKERRQ(ierr);
4523 #endif
4524     }
4525   }
4526   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4527 
4528   /* determine which version of MatSolve needs to be used; adapted from MatLUFactorNumeric_AIJ_SeqAIJCUSPARSE */
4529   B->ops->solve = MatSolve_SeqAIJCUSPARSEBAND;
4530   B->ops->solvetranspose = NULL; // need transpose
4531   B->ops->matsolve = NULL;
4532   B->ops->matsolvetranspose = NULL;
4533 
4534   PetscFunctionReturn(0);
4535 }
4536 
4537 static PetscErrorCode MatrixNfDestroy(void *ptr)
4538 {
4539   PetscInt *nf = (PetscInt *)ptr;
4540   PetscErrorCode  ierr;
4541   PetscFunctionBegin;
4542   ierr = PetscFree(nf);CHKERRQ(ierr);
4543   PetscFunctionReturn(0);
4544 }
4545 
4546 PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSEBAND(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
4547 {
4548   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data,*b;
4549   IS                 isicol;
4550   PetscErrorCode     ierr;
4551   cudaError_t        cerr;
4552   const PetscInt     *ic,*ai=a->i,*aj=a->j;
4553   PetscScalar        *ba_t;
4554   int                *bi_t;
4555   PetscInt           i,n=A->rmap->n,Nf;
4556   PetscInt           nzBcsr,bwL,bwU;
4557   PetscBool          missing;
4558   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
4559   PetscContainer               container;
4560 
4561   PetscFunctionBegin;
4562   if (A->rmap->N != A->cmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"matrix must be square");
4563   ierr = MatMissingDiagonal(A,&missing,&i);CHKERRQ(ierr);
4564   if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",i);
4565   if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Missing cusparseTriFactors");
4566   ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&missing);CHKERRQ(ierr);
4567   if (!missing) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"only structurally symmetric matrices supported");
4568 
4569   // factor: get Nf if available
4570   ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr);
4571   if (container) {
4572     PetscInt *pNf=NULL;
4573     ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr);
4574     Nf = (*pNf)%1000;
4575     ierr = PetscContainerCreate(PETSC_COMM_SELF, &container);CHKERRQ(ierr);
4576     ierr = PetscMalloc(sizeof(PetscInt), &pNf);CHKERRQ(ierr);
4577     *pNf = Nf;
4578     ierr = PetscContainerSetPointer(container, (void *)pNf);CHKERRQ(ierr);
4579     ierr = PetscContainerSetUserDestroy(container, MatrixNfDestroy);CHKERRQ(ierr);
4580     ierr = PetscObjectCompose((PetscObject)B, "Nf", (PetscObject) container);CHKERRQ(ierr);
4581     ierr = PetscContainerDestroy(&container);CHKERRQ(ierr);
4582   } else Nf = 1;
4583   if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf);
4584 
4585   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
4586   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
4587 
4588   ierr = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
4589   ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)isicol);CHKERRQ(ierr);
4590   b    = (Mat_SeqAIJ*)(B)->data;
4591 
4592   /* get band widths, MatComputeBandwidth should take a reordering ic and do this */
4593   bwL = bwU = 0;
4594   for (int rwb=0; rwb<n; rwb++) {
4595     const PetscInt rwa = ic[rwb], anz = ai[rwb+1] - ai[rwb], *ajtmp = aj + ai[rwb];
4596     for (int j=0;j<anz;j++) {
4597       PetscInt colb = ic[ajtmp[j]];
4598       if (colb<rwa) { // L
4599         if (rwa-colb > bwL) bwL = rwa-colb;
4600       } else {
4601         if (colb-rwa > bwU) bwU = colb-rwa;
4602       }
4603     }
4604   }
4605   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
4606   /* only structurally symmetric matrices are supported for now, though an unsymmetric bandwidth might still work */
4607   if (bwL!=bwU) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Only symmetric structure supported (now) W_L=%D W_U=%D",bwL,bwU);
4608   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
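       // band nonzero count: (2*bwU+1)*n minus the two corner ears of bwU*(bwU+1)/2 entries each,
       // i.e. nzBcsr = n + (2n-1)*bwU - bwU^2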
4609   nzBcsr = n + (2*n-1)*bwU - bwU*bwU;
4610   b->maxnz = b->nz = nzBcsr;
4611   cusparseTriFactors->nnz = b->nz; // only meta data needed: n & nz
4612   if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
4613   cerr = cudaMalloc(&ba_t,(b->nz+1)*sizeof(PetscScalar));CHKERRCUDA(cerr); // include a slot at the end for the flop count
4614   cerr = cudaMalloc(&bi_t,(n+1)*sizeof(int));CHKERRCUDA(cerr);
4615   cusparseTriFactors->a_band_d = ba_t;
4616   cusparseTriFactors->i_band_d = bi_t;
4617   /* In b structure:  Free imax, ilen, old a, old j.  Allocate solve_work, new a, new j */
4618   ierr = PetscLogObjectMemory((PetscObject)B,(nzBcsr+1)*(sizeof(PetscInt)+sizeof(PetscScalar)));CHKERRQ(ierr);
4619   {
4620     dim3 dimBlockTeam(1,128);
4621     dim3 dimBlockLeague(Nf,1);
4622     mat_lu_factor_band_init_set_i<<<dimBlockLeague,dimBlockTeam>>>(n, bwU, bi_t);
4623   }
4624   CHECK_LAUNCH_ERROR(); // does a sync
4625 
4626   // setup data
4627   if (!cusparseTriFactors->rpermIndices) {
4628     const PetscInt *r;
4629 
4630     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
4631     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
4632     cusparseTriFactors->rpermIndices->assign(r, r+n);
4633     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
4634     ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
4635   }
4636   /* upper triangular indices */
4637   if (!cusparseTriFactors->cpermIndices) {
4638     const PetscInt *c;
4639 
4640     ierr = ISGetIndices(isicol,&c);CHKERRQ(ierr);
4641     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
4642     cusparseTriFactors->cpermIndices->assign(c, c+n);
4643     ierr = ISRestoreIndices(isicol,&c);CHKERRQ(ierr);
4644     ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
4645   }
4646 
4647   /* put together the new matrix */
4648   b->free_a       = PETSC_FALSE;
4649   b->free_ij      = PETSC_FALSE;
4650   b->singlemalloc = PETSC_FALSE;
4651   b->ilen = NULL;
4652   b->imax = NULL;
4653   b->row  = isrow;
4654   b->col  = iscol;
4655   ierr    = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
4656   ierr    = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
4657   b->icol = isicol;
4658   ierr    = PetscMalloc1(n+1,&b->solve_work);CHKERRQ(ierr);
4659 
4660   B->factortype            = MAT_FACTOR_LU;
4661   B->info.factor_mallocs   = 0;
4662   B->info.fill_ratio_given = 0;
4663 
4664   if (ai[n]) {
4665     B->info.fill_ratio_needed = ((PetscReal)(nzBcsr))/((PetscReal)ai[n]);
4666   } else {
4667     B->info.fill_ratio_needed = 0.0;
4668   }
4669 #if defined(PETSC_USE_INFO)
4670   if (ai[n] != 0) {
4671     PetscReal af = B->info.fill_ratio_needed;
4672     ierr = PetscInfo1(A,"Band fill ratio %g\n",(double)af);CHKERRQ(ierr);
4673   } else {
4674     ierr = PetscInfo(A,"Empty matrix\n");CHKERRQ(ierr);
4675   }
4676 #endif
4677   if (a->inode.size) {
4678     ierr = PetscInfo(A,"Warning: using inodes in band solver.\n");CHKERRQ(ierr);
4679   }
4680   ierr = MatSeqAIJCheckInode_FactorLU(B);CHKERRQ(ierr);
4681   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSEBAND;
4682   B->offloadmask = PETSC_OFFLOAD_GPU;
4683 
4684   PetscFunctionReturn(0);
4685 }
4686 
4687 /* Use -pc_factor_mat_solver_type cusparseband */
4688 PetscErrorCode MatFactorGetSolverType_seqaij_cusparse_band(Mat A,MatSolverType *type)
4689 {
4690   PetscFunctionBegin;
4691   *type = MATSOLVERCUSPARSEBAND;
4692   PetscFunctionReturn(0);
4693 }
4694 
4695 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat A,MatFactorType ftype,Mat *B)
4696 {
4697   PetscErrorCode ierr;
4698   PetscInt       n = A->rmap->n;
4699 
4700   PetscFunctionBegin;
4701   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
4702   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
4703   (*B)->factortype = ftype;
4704   (*B)->useordering = PETSC_TRUE;
4705   ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
4706 
4707   if (ftype == MAT_FACTOR_LU) {
4708     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
4709     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
4710     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSEBAND;
4711   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSEBAND Matrix Types");
4712 
4713   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
4714   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse_band);CHKERRQ(ierr);
4715   PetscFunctionReturn(0);
4716 }
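
     /*
       Typical use (an illustrative option set; the exact options depend on the application):
       with a MATSEQAIJCUSPARSE matrix, this factorization can be selected at run time via
         -pc_type lu -pc_factor_mat_solver_type cusparseband
     */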
4717 
4718 #define WARP_SIZE 32
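     // warp-level sum reduction with shuffles; the full warp sum ends up in lane 0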
4719 template <typename T>
4720 __forceinline__ __device__
4721 T wreduce(T a)
4722 {
4723   T b;
4724   #pragma unroll
4725   for (int i = WARP_SIZE/2; i >= 1; i = i >> 1) {
4726     b = __shfl_down_sync(0xffffffff, a, i);
4727     a += b;
4728   }
4729   return a;
4730 }
4731 // reduce in a block, returns result in thread 0
4732 template <typename T, int BLOCK_SIZE>
4733 __device__
4734 T breduce(T a)
4735 {
4736   constexpr int NWARP = BLOCK_SIZE/WARP_SIZE;
4737   __shared__ double buf[NWARP];
4738   int wid = threadIdx.x / WARP_SIZE;
4739   int laneid = threadIdx.x % WARP_SIZE;
4740   T b = wreduce<T>(a);
4741   if (laneid == 0)
4742     buf[wid] = b;
4743   __syncthreads();
4744   if (wid == 0) {
4745     if (threadIdx.x < NWARP)
4746       a = buf[threadIdx.x];
4747     else
4748       a = 0;
4749     for (int i = (NWARP+1)/2; i >= 1; i = i >> 1) {
4750       a += __shfl_down_sync(0xffffffff, a, i);
4751     }
4752   }
4753   return a;
4754 }
4755 
4756 
4757 // Band triangular solve kernel --- ba_csr, x
4758 template <int BLOCK_SIZE>
4759 __global__
4760 void __launch_bounds__(256,1)
4761 mat_solve_band(const PetscInt n, const PetscInt bw, const PetscScalar ba_csr[], PetscScalar x[])
4762 {
4763   const PetscInt    Nf = gridDim.x, nloc = n/Nf, field = blockIdx.x, start = field*nloc, end = start + nloc, chopnz = bw*(bw+1)/2, blocknz=(2*bw+1)*nloc, blocknz_0 = blocknz-chopnz;
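       // per-field band layout: a full interior field holds blocknz = (2*bw+1)*nloc entries;
       // chopnz = bw*(bw+1)/2 entries are missing in a corner ear, so field 0 (no top-left ear) holds
       // blocknz_0 entries and the whole band holds Nf*blocknz - 2*chopnz entries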
4764   const PetscScalar *pLi;
4765   const int tid = threadIdx.x;
4766 
4767   /* Next, solve L */
4768   pLi = ba_csr + (field==0 ? 0 : blocknz_0 + (field-1)*blocknz + bw); // diagonal (0,0) in field
4769   for (int glbDD=start, locDD = 0; glbDD<end; glbDD++, locDD++) {
4770     const PetscInt col = locDD<bw ? start : (glbDD-bw);
4771     PetscScalar t = 0;
4772     for (int j=col+tid,idx=tid;j<glbDD;j+=blockDim.x,idx+=blockDim.x) {
4773       t += pLi[idx]*x[j];
4774     }
4775 #if defined(PETSC_USE_COMPLEX)
4776     PetscReal tr = PetscRealPartComplex(t), ti = PetscImaginaryPartComplex(t);
4777     PetscScalar tt(breduce<PetscReal,BLOCK_SIZE>(tr), breduce<PetscReal,BLOCK_SIZE>(ti));
4778     t = tt;
4779 #else
4780     t = breduce<PetscReal,BLOCK_SIZE>(t);
4781 #endif
4782     if (threadIdx.x == 0)
4783       x[glbDD] -= t; // /1.0
4784     __syncthreads();
4785     // inc
4786     pLi += glbDD-col; // get to diagonal
4787     if (glbDD > n-1-bw) pLi += n-1-glbDD; // skip over U, only last block has funny offset
4788     else pLi += bw;
4789     pLi += 1; // skip to next row
4790     if (field>0 && (locDD+1)<bw) pLi += bw-(locDD+1); // skip padding at beginning (ear)
4791   }
4792   /* Then, solve U */
4793   pLi = ba_csr + Nf*blocknz - 2*chopnz - 1; // last stored entry of the band (diagonal of the last row)
4794   if (field != Nf-1) pLi -= blocknz_0 + (Nf-2-field)*blocknz + bw; // diagonal of last local row
4795   for (int glbDD=end-1, locDD = 0; glbDD >= start; glbDD--, locDD++) {
4796     const PetscInt col = (locDD<bw) ? end-1 : glbDD+bw; // end of row in U
4797     PetscScalar t = 0;
4798     for (int j=col-tid,idx=tid;j>glbDD;j-=blockDim.x,idx+=blockDim.x) {
4799       t += pLi[-idx]*x[j];
4800     }
4801 #if defined(PETSC_USE_COMPLEX)
4802     PetscReal tr = PetscRealPartComplex(t), ti = PetscImaginaryPartComplex(t);
4803     PetscScalar tt(breduce<PetscReal,BLOCK_SIZE>(tr), breduce<PetscReal,BLOCK_SIZE>(ti));
4804     t = tt;
4805 #else
4806     t = breduce<PetscReal,BLOCK_SIZE>(PetscRealPart(t));
4807 #endif
4808     pLi -= col-glbDD; // diagonal
4809     if (threadIdx.x == 0) {
4810       x[glbDD] -= t;
4811       x[glbDD] /= pLi[0];
4812     }
4813     __syncthreads();
4814     // inc past L to start of previous U
4815     pLi -= bw+1;
4816     if (glbDD<bw) pLi += bw-glbDD; // overshot in top left corner
4817     if (((locDD+1) < bw) && field != Nf-1) pLi -= (bw - (locDD+1)); // skip past right corner
4818   }
4819 }
4820 
4821 static PetscErrorCode MatSolve_SeqAIJCUSPARSEBAND(Mat A,Vec bb,Vec xx)
4822 {
4823   const PetscScalar                     *barray;
4824   PetscScalar                           *xarray;
4825   thrust::device_ptr<const PetscScalar> bGPU;
4826   thrust::device_ptr<PetscScalar>       xGPU;
4827   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
4828   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
4829   PetscInt                              n=A->rmap->n, nz=cusparseTriFactors->nnz, bw=(2*n-1 - (int)(PetscSqrtReal(1+4*(n*n-nz))+PETSC_MACHINE_EPSILON))/2, Nf;
4830   PetscErrorCode                        ierr;
4831   cudaError_t                           cerr;
4832   PetscContainer                        container;
4833 
4834   PetscFunctionBegin;
4835   if (A->rmap->n == 0) {
4836     PetscFunctionReturn(0);
4837   }
4838   // factor: get Nf if available
4839   ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr);
4840   if (container) {
4841     PetscInt *pNf=NULL;
4842     ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr);
4843     Nf = (*pNf)%1000;
4844   } else Nf = 1;
4845   if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf);
4846 
4847   /* Get the GPU pointers */
4848   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
4849   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
4850   xGPU = thrust::device_pointer_cast(xarray);
4851   bGPU = thrust::device_pointer_cast(barray);
4852 
4853   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4854   /* First, reorder with the row permutation */
4855   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
4856                thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
4857                tempGPU->begin());
4858   constexpr int block = 128;
4859   mat_solve_band<block><<<Nf,block>>>(n,bw,cusparseTriFactors->a_band_d,tempGPU->data().get());
4860   CHECK_LAUNCH_ERROR(); // does a sync
4861 
4862   /* Last, reorder with the column permutation */
4863   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
4864                thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
4865                xGPU);
4866 
4867   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
4868   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
4869   cerr = WaitForCUDA();CHKERRCUDA(cerr);
4870   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4871   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
4872   PetscFunctionReturn(0);
4873 }
4874