xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision ac364067c38fcf400184aa78d587ee09f0c4bff5)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_CXX_COMPLEX_FIX
7 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
8 
9 #include <petscconf.h>
10 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
11 #include <../src/mat/impls/sbaij/seq/sbaij.h>
12 #include <../src/vec/vec/impls/dvecimpl.h>
13 #include <petsc/private/vecimpl.h>
14 #undef VecType
15 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
16 #include <thrust/async/for_each.h>
17 
/* Enum-name tables consumed by PetscOptionsEnum(): the value names (in 0-based enum-value
   order), then the enum type name, the common prefix, and a null terminator. */
const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
    0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
  } cusparseCsr2CscAlg_t;
  */
  /* Note: SPMM names are listed in enum-VALUE order (COO_ALG4=5 comes after CSR_ALG1=4),
     and Csr2Csc gets a placeholder "INVALID" because cuSPARSE has no value 0 for it. */
  const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
  const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
  const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif
54 
/* Forward declarations: Cholesky/ICC factorization entry points */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

/* LU/ILU factorization entry points */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

/* Triangular solves (with and without a reordering permutation) and matrix operations */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

/* Teardown helpers; note MatSeqAIJCUSPARSEMultStruct_Destroy is C++-overloaded for the
   triangular-factor struct and the mult struct */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

/* Host<->device synchronization helpers */
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);

/* COO assembly interface used by the MPI layer */
PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
93 
/* Records the CUDA stream in A's GPU-side data and installs it on the cuSPARSE handle,
   so subsequent cuSPARSE calls for this matrix are issued on that stream.
   Errors with PETSC_ERR_COR if A has no GPU-side data (spptr) yet. */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   status;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusp->stream = stream;
  status = cusparseSetStream(cusp->handle,cusp->stream);CHKERRCUSPARSE(status);
  PetscFunctionReturn(0);
}
105 
/* Replaces the cuSPARSE handle stored in A's GPU-side data. Any previously owned,
   different handle is destroyed first; the (possibly unchanged) handle is then put
   into device pointer mode. Errors with PETSC_ERR_COR if A has no GPU-side data. */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   status;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusp->handle != handle) {
    /* release the old handle before adopting the caller's */
    if (cusp->handle) {status = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(status);}
    cusp->handle = handle;
  }
  /* scalar arguments (alpha/beta) will live in device memory */
  status = cusparseSetPointerMode(cusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(status);
  PetscFunctionReturn(0);
}
122 
/* Drops A's reference to its cuSPARSE handle WITHOUT destroying it (the handle is
   presumably owned elsewhere after MatCUSPARSESetHandle() -- the caller keeps
   responsibility for destroying it). Silently does nothing when A is not a
   MATSEQAIJCUSPARSE matrix or has no GPU-side data. */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          iscusparse;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&iscusparse);CHKERRQ(ierr);
  if (!iscusparse || !cusp) PetscFunctionReturn(0);
  if (cusp->handle) cusp->handle = 0;
  PetscFunctionReturn(0);
}
135 
/* Query callback composed onto factor matrices created by this package:
   reports MATSOLVERCUSPARSE ("cusparse") as the solver type. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
142 
143 /*MC
144   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
145   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
147   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
148   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
149   algorithms are not recommended. This class does NOT support direct solver operations.
150 
151   Level: beginner
152 
153 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
154 M*/
155 
/* Factory routine registered with the MatSolver registry: creates the (empty) factor
   matrix B of type MATSEQAIJCUSPARSE for A and wires in the symbolic factorization
   routines appropriate to the requested factor type. Unsupported factor types raise
   PETSC_ERR_SUP. Note the factor type is set before MatSetType() is called. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  (*B)->factortype  = ftype;
  (*B)->useordering = PETSC_TRUE;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU:
  case MAT_FACTOR_ILUDT:
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC:
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  }

  /* skip AIJ allocation: the symbolic factorization fills the structure later */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
181 
/* Type-specific implementation behind MatCUSPARSESetFormat(): stores the requested
   GPU storage format. For a sequential matrix MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL
   are equivalent (there is a single format field); anything else is unsupported. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:  /* fall through: both ops set the same field */
  case MAT_CUSPARSE_ALL:
    cusp->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
199 
200 /*@
201    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
202    operation. Only the MatMult operation can use different GPU storage formats
203    for MPIAIJCUSPARSE matrices.
204    Not Collective
205 
206    Input Parameters:
207 +  A - Matrix of type SEQAIJCUSPARSE
208 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
209 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
210 
   Level: intermediate
214 
215 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
216 @*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* PetscTryMethod: silently a no-op for matrix types that do not compose
     "MatCUSPARSESetFormat_C" (i.e. non-CUSPARSE matrices) */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
226 
/* MatSetOption implementation: handles MAT_FORM_EXPLICIT_TRANSPOSE locally (destroying
   a stale explicit transpose when the option is switched off) and defers every other
   option to the host SeqAIJ implementation. */
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);}
    A->form_explicit_transpose = flg;
  } else {
    ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
244 
/* Parses the -mat_cusparse_* command-line options for an unfactored matrix: the GPU
   storage format (for SpMV only, or for everything) and, on CUDA >= 11, the cuSPARSE
   algorithm choices for SpMV, SpMM and CSR->CSC conversion. The algorithm enums are
   written in place into cusparsestruct; the sanity checks below guard against cuSPARSE
   renumbering its enums, since PetscOptionsEnum() maps by array position. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");

    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
   #endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
280 
/* Symbolic ILU: resets any stale GPU triangular-factor data, runs the host SeqAIJ
   symbolic phase, and points the numeric phase at the CUSPARSE implementation. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&factors);CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
292 
/* Symbolic LU: same pattern as the ILU variant -- reset GPU factor data, delegate the
   symbolic phase to the host SeqAIJ code, install the CUSPARSE numeric phase. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&factors);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
304 
/* Symbolic ICC: reset GPU factor data, delegate to the host SeqAIJ symbolic phase,
   install the CUSPARSE Cholesky numeric phase. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&factors);CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
316 
/* Symbolic Cholesky: reset GPU factor data, delegate to the host SeqAIJ symbolic
   phase, install the CUSPARSE Cholesky numeric phase. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&factors);CHKERRQ(ierr);
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
328 
/* Builds (first call) or refreshes (later calls, same nonzero pattern) the
   unit-diagonal lower-triangular ILU factor on the GPU from the host data of the
   factored SeqAIJ matrix A. An explicit 1.0 diagonal entry is appended to every row,
   the assembled CSR is uploaded into a new CsrMatrix, and the cuSPARSE
   triangular-solve analysis is run once. No-op when the GPU copy is current. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* = n unit diagonals + strictly-lower entries of rows 1..n-1 (row 0 contributes only its unit diagonal) */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {
        PetscScalar                       *AALo;

        /* pinned (page-locked) host staging buffers */
        cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the lower triangular matrix */
        cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the lower triangular matrix: row 0 is just the unit diagonal, then each
           row i copies its strictly-lower entries and appends the unit diagonal */
        AiLo[0]  = (PetscInt) 0;
        AiLo[n]  = nzLower;
        AjLo[0]  = (PetscInt) 0;
        AALo[0]  = (MatScalar) 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        rowOffset= 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix (assign() copies the pinned host buffers to the device) */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* newer cuSPARSE needs an explicit, user-allocated work buffer for csrsv2 */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
                               #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                               #endif
);CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        /* AA_h is retained (not freed like AiLo/AjLo) so value-only refreshes reuse it */
        loTriFactor->AA_h = AALo;
        cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only: pattern unchanged, so only AA_h is rebuilt and re-uploaded */
        if (!loTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0]  = 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
          offset      += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset      += 1;
          v  += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
473 
/* Builds (first call) or refreshes (later calls, same nonzero pattern) the
   upper-triangular ILU factor on the GPU. The SeqAIJ factored storage keeps the upper
   part addressed through a->diag in reverse; rows are assembled back-to-front with the
   RECIPROCAL of the diagonal stored first in each row (non-unit diagonal), uploaded to
   a CsrMatrix, and the cuSPARSE triangular-solve analysis is run once. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      /* adiag[] decreases here, so adiag[0]-adiag[n] spans all upper entries incl. diagonals */
      nzUpper = adiag[0]-adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        /* pinned (page-locked) host staging buffers */
        cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix, assembling rows from last to first */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          /* row i's off-diagonal entries start just past adiag[i+1] in the factored storage */
          v  = aa + adiag[i+1] + 1;
          vi = aj + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements (stored as reciprocal, as in SeqAIJ factors) */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1./v[nz];
          AiUp[i]      = AiUp[i+1] - (nz+1);

          ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix (assign() copies the pinned host buffers to the device) */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = n;
        upTriFactor->csrMat->num_cols = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* newer cuSPARSE needs an explicit, user-allocated work buffer for csrsv2 */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                     upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                     upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                     upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                     &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
                               #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                               #endif
);CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
        /* AA_h is retained (not freed like AiUp/AjUp) so value-only refreshes reuse it */
        upTriFactor->AA_h = AAUp;
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only: pattern unchanged, so only AA_h is rebuilt and re-uploaded */
        if (!upTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1./v[nz];
          ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
        ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
617 
/* Builds both ILU triangular factors on the GPU and caches the row/column
   permutation index vectors that the permuted MatSolve paths read.
   The permutation uploads happen at most once (guarded by the NULL checks). */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a       = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow    = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  PetscInt                     n        = A->rmap->n;

  PetscFunctionBegin;
  if (!factors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  if (!factors->workVector) factors->workVector = new THRUSTARRAY(n);
  factors->nnz = a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* row permutation indices */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity && !factors->rpermIndices) {
    const PetscInt *ridx;

    ierr = ISGetIndices(isrow,&ridx);CHKERRQ(ierr);
    factors->rpermIndices = new THRUSTINTARRAY(n);
    factors->rpermIndices->assign(ridx,ridx+n);
    ierr = ISRestoreIndices(isrow,&ridx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* column permutation indices */
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity && !factors->cpermIndices) {
    const PetscInt *cidx;

    ierr = ISGetIndices(iscol,&cidx);CHKERRQ(ierr);
    factors->cpermIndices = new THRUSTINTARRAY(n);
    factors->cpermIndices->assign(cidx,cidx+n);
    ierr = ISRestoreIndices(iscol,&cidx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
661 
/*
   Builds the GPU data structures for the ICC triangular solves.

   The upper factor U is stored in CSR on the device.  The "lower" factor is
   stored with the SAME sparsity pattern (AiUp/AjUp) but its own scaled values
   (AALo) and is solved with CUSPARSE_OPERATION_TRANSPOSE, i.e. the lower
   triangular solve is done as a transposed upper-triangular solve.

   First call (no factor structs yet): builds the CSR pattern and values on
   the host (pinned memory), creates the cusparse descriptors, uploads, and
   runs the cusparse solve analysis.  Subsequent calls: only the numerical
   values are recomputed and re-uploaded; the pattern and analysis are reused.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is cast both to Mat_SeqAIJ (for nz) and to
     Mat_SeqSBAIJ (for i/j/a) — presumably the ICC factor is laid out with the
     SBAIJ (upper-triangle) structure; confirm against the factor assembly code. */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host buffers for the values of both factors */
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          /* the diagonal of row i is stored at v[nz], after the off-diagonals */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            /* off-diagonals: negated for U, additionally scaled by 1/diag for L */
            ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        /* upper factor: unit diagonal as far as the cusparse solve is concerned */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                                #endif
);CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        /* "lower" factor: same UPPER fill mode as U; the lower solve is
           obtained via solveOp = TRANSPOSE below.  Non-unit diagonal. */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        /* shares the pattern (AiUp/AjUp) with the upper factor; values differ */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                                #endif
);CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      } else {
        /* pattern and analysis already exist: recompute values only */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
877 
/* Builds the ICC factor structures on the GPU and, for a nontrivial ordering,
   uploads both the permutation and its inverse for use by MatSolve. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a       = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip       = a->row;
  PetscBool                    identity;
  PetscInt                     n        = A->rmap->n;

  PetscFunctionBegin;
  if (!factors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!factors->workVector) factors->workVector = new THRUSTARRAY(n);
  /* both triangles are represented; the diagonal is counted only once */
  factors->nnz = (a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* upload the row permutation and its inverse if the ordering is nontrivial */
  ierr = ISIdentity(ip,&identity);CHKERRQ(ierr);
  if (!identity) {
    IS             iip;
    const PetscInt *irip,*rip;

    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
    factors->rpermIndices = new THRUSTINTARRAY(n);
    factors->rpermIndices->assign(rip,rip+n);
    factors->cpermIndices = new THRUSTINTARRAY(n);
    factors->cpermIndices->assign(irip,irip+n);
    ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
915 
/* Numeric LU factorization: the factorization itself runs on the CPU
   (MatLUFactorNumeric_SeqAIJ); afterwards the triangular factors are copied
   to the GPU and the appropriate MatSolve implementations are installed. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* determine which version of MatSolve needs to be used: the natural-ordering
     variant is selected only when both permutations are the identity */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
946 
/* Numeric Cholesky factorization: the factorization itself runs on the CPU
   (MatCholeskyFactorNumeric_SeqAIJ); afterwards the factors are copied to the
   GPU and the appropriate MatSolve implementations are installed. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* determine which version of MatSolve needs to be used: the natural-ordering
     variant is selected only when the permutation is the identity */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
976 
/*
   Builds explicit transposes (CSC representations) of both GPU triangular
   factors and runs the cusparse solve analysis on them, so that
   MatSolveTranspose can be executed with non-transposed triangular solves.

   Fix: the two MAT_CUSPARSEGenerateTranspose logging regions previously
   called PetscLogEventBegin twice (begin/begin) instead of begin/end, which
   left the event permanently "open" and corrupted the event timings.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor: the transpose
     keeps the type/base/diag but flips the fill mode */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* CUDA 11 csr2csc requires an explicit work buffer */
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                       loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                       loTriFactor->csrMat->values->data().get(),
                                       loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(),
                                       loTriFactorT->csrMat->values->data().get(),
                                       loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer
                        #else
                          loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
                        #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); /* was erroneously PetscLogEventBegin */

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                           loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                          #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                upTriFactor->csrMat->values->data().get(),
                                upTriFactor->csrMat->row_offsets->data().get(),
                                upTriFactor->csrMat->column_indices->data().get(),
                                upTriFactorT->csrMat->values->data().get(),
                                upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                CUSPARSE_ACTION_NUMERIC,indexBase,
                                CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer
                        #else
                          upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
                        #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); /* was erroneously PetscLogEventBegin */

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
  #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
  #endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                           upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                          #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
1178 
/* Host/device unary functor that converts a PetscScalar into a PetscInt by
   truncating its real part.  In this file it is applied with thrust::transform
   to recover integer permutation indices that were computed into a scalar
   array (see the csr2csc_i construction in
   MatSeqAIJCUSPARSEFormExplicitTransposeForMult). */
struct PetscScalarToPetscInt
{
  __host__ __device__
  PetscInt operator()(PetscScalar val)
  {
    return static_cast<PetscInt>(PetscRealPart(val));
  }
};
1187 
/*
   MatSeqAIJCUSPARSEFormExplicitTransposeForMult - build (or numerically refresh) an
   explicit copy of A^T on the GPU so transpose multiplies can use the transposed data
   directly.

   Behavior visible in this routine:
   - No-op unless A->form_explicit_transpose is set and A has nonzero row/column counts.
   - Returns early if A->transupdated is already set (erroring if the transpose struct
     is then missing).
   - For MAT_CUSPARSE_CSR: on first use it allocates the transposed CSR arrays and runs
     csr2csc once on a sequence 0,1,2,... to discover the value permutation, caching it
     in cusparsestruct->csr2csc_i; every call then (re)applies that cached permutation
     to A's current values with a thrust permutation-iterator copy.
   - For ELL/HYB (pre CUDA-11 only): converts HYB->CSR->CSC->HYB with temporaries.
*/
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (!A->form_explicit_transpose || !A->rmap->n || !A->cmap->n) PetscFunctionReturn(0);
  /* make sure the CSR data for A itself is current on the device first */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(0);
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
    /* non-CSR formats rebuild the transpose from scratch every time */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
  }
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

    /* set alpha and beta */
    err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* allocate the CSR arrays for A^T: dimensions swapped, same nnz */
      CsrMatrix *matrixT = new CsrMatrix;
      matstructT->mat = matrixT;
      matrixT->num_rows = A->cmap->n;
      matrixT->num_cols = A->rmap->n;
      matrixT->num_entries = a->nz;
      matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values = new THRUSTARRAY(a->nz);

      /* 32-bit copy of A's row offsets on the device, needed by csr2csc below */
      if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&matstructT->matDescr,
                               matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                               matrixT->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
     #endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
   #else
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows = A->rmap->n;
      temp->num_cols = A->cmap->n;
      temp->num_entries = a->nz;
      temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle,
                              matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get());CHKERRCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      tempT->num_rows = A->rmap->n;
      tempT->num_cols = A->cmap->n;
      tempT->num_entries = a->nz;
      tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                              temp->num_cols, temp->num_entries,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get(),
                              tempT->values->data().get(),
                              tempT->column_indices->data().get(),
                              tempT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

      /* Last, convert CSC to HYB */
      cusparseHybMat_t hybMat;
      stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
      cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
        CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                              matstructT->descr, tempT->values->data().get(),
                              tempT->row_offsets->data().get(),
                              tempT->column_indices->data().get(),
                              hybMat, 0, partition);CHKERRCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY*) tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
        delete (CsrMatrix*) tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY*) temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
        delete (CsrMatrix*) temp;
      }
     #endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
    if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix");
    if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix rows");
    if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix cols");
    if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix values");
    if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT");
    if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT rows");
    if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT cols");
    if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
      ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    if (!cusparsestruct->csr2csc_i) {
      /* Run csr2csc once on the sequence 0,1,2,... (stored as scalars) so that the
         transposed "values" are exactly the source positions, i.e. the permutation
         from A's value array to A^T's value array. */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      void   *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                           A->cmap->n, matrix->num_entries,
                                           matrix->values->data().get(),
                                           cusparsestruct->rowoffsets_gpu->data().get(),
                                           matrix->column_indices->data().get(),
                                           matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                           CUSPARSE_ACTION_NUMERIC,indexBase,
                                           cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat);
      err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err);
     #endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                              A->cmap->n,matrix->num_entries,
                              csr2csc_a.data().get(),
                              cusparsestruct->rowoffsets_gpu->data().get(),
                              matrix->column_indices->data().get(),
                              matrixT->values->data().get(),
                             #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                              matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                              CUSPARSE_ACTION_NUMERIC,indexBase,
                              cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat);
                             #else
                              matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
                             #endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
      }

      /* cache the permutation as integers; PetscScalarToPetscInt truncates the real part */
      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      err = cudaFree(csr2cscBuffer);CHKERRCUDA(err);
     #endif
    }
    /* numerical update: gather A's current values into A^T's layout via the cached permutation */
    PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
                                                     thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
                                                     matrixT->values->begin()));
  }
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
1394 
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
   MatSolveTranspose_SeqAIJCUSPARSE - solve A^T x = b on the GPU using the transposed
   triangular factors (permuted ordering).

   Steps: lazily build the transpose factor analysis, apply the row permutation to b,
   solve with U^T then L^T, then apply the column permutation.  Note the solve order
   is U before L here, the reverse of the non-transposed MatSolve_SeqAIJCUSPARSE.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  /* NOTE(review): assumes both transpose factors are created/destroyed together; if
     only one were NULL this would use the other uninitialized -- confirm invariant */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: x = b(rperm) */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U: temp = U^{-T} x (num_entries argument only exists for CUDA >= 9) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Then, solve L: x = L^{-T} temp */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1481 
/*
   MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering - solve A^T x = b on the GPU when
   the factorization used natural ordering, so no row/column permutations are applied.
   Solve order is U^T then L^T, reading b directly and writing x directly.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  /* NOTE(review): assumes both transpose factors exist together -- confirm invariant */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U: temp = U^{-T} b */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Then, solve L: x = L^{-T} temp */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1549 
/*
   MatSolve_SeqAIJCUSPARSE - solve A x = b on the GPU with the permuted triangular
   factors: apply the row permutation to b, solve L then U, then apply the column
   permutation into x.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: temp = b(rperm) */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L: x = L^{-1} temp (num_entries argument only exists for CUDA >= 9) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Then, solve U: temp = U^{-1} x */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Last, reorder with the column permutation: x = temp(cperm) */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1624 
/*
   MatSolve_SeqAIJCUSPARSE_NaturalOrdering - solve A x = b on the GPU when the
   factorization used natural ordering: no permutations, just L then U solves,
   staging the intermediate result in the factors' work vector.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L: temp = L^{-1} b (num_entries argument only exists for CUDA >= 9) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Next, solve U: x = U^{-1} temp */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1684 
/*
   MatSeqAIJCUSPARSECopyFromGPU - mirror the matrix numerical values resident on the
   GPU back into the host CSR storage.  Only the values array is transferred (a->nz
   scalars); when the transfer completes the offload mask becomes PETSC_OFFLOAD_BOTH.
   A no-op unless the GPU currently holds the only up-to-date copy.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *aijhost = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *devmat  = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;
  cudaError_t        cerr;

  PetscFunctionBegin;
  /* nothing to do unless the device copy is the fresh one */
  if (A->offloadmask != PETSC_OFFLOAD_GPU) PetscFunctionReturn(0);
  {
    CsrMatrix *devcsr = (CsrMatrix*)devmat->mat->mat;

    ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    cerr = cudaMemcpy(aijhost->a, devcsr->values->data().get(), aijhost->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuToCpu(aijhost->nz*sizeof(PetscScalar));CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
  }
  /* host and device now agree */
  A->offloadmask = PETSC_OFFLOAD_BOTH;
  PetscFunctionReturn(0);
}
1705 
/*
   MatSeqAIJGetArray_SeqAIJCUSPARSE - expose the host CSR values array to the caller.
   Ensures the host copy is current first, and marks the CPU side as authoritative
   afterwards since the caller may modify the returned array.
*/
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *aijdata = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  /* pull any fresh GPU values down before handing out the host pointer */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  /* the caller may write through the pointer, so the device copy is now stale */
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array = aijdata->a;
  PetscFunctionReturn(0);
}
1717 
/*
  MatSeqAIJCUSPARSECopyToGPU - Bring the cuSPARSE (device) copy of the matrix
  up to date with the host AIJ data.

  Fast path: when the nonzero pattern is unchanged (nonzerostate matches) and
  the storage format is CSR, only the values array is re-uploaded and any
  cached explicit transpose is invalidated (values only).

  Slow path: the existing cuSPARSE structures are destroyed and rebuilt from
  the host CSR arrays (the compressed-row variant when a->compressedrow.use);
  for CUDA < 11 an ELL/HYB matrix may be built instead of CSR.  No-op unless
  the host holds newer data (offloadmask UNALLOCATED or CPU).
*/
1718 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1719 {
1720   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1721   Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
1722   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1723   PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
1724   PetscErrorCode               ierr;
1725   cusparseStatus_t             stat;
1726   PetscBool                    both = PETSC_TRUE;
1727   cudaError_t                  err;
1728 
1729   PetscFunctionBegin;
1730   if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Cannot copy to GPU");
1731   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
1732     if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
1733       CsrMatrix *matrix;
1734       matrix = (CsrMatrix*)cusparsestruct->mat->mat;
1735 
1736       if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR values");
1737       ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1738       matrix->values->assign(a->a, a->a+a->nz);
1739       err  = WaitForCUDA();CHKERRCUDA(err);
1740       ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
1741       ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1742       ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
1743     } else {
       /* pattern changed (or non-CSR format): tear down and rebuild the whole device structure */
1744       PetscInt nnz;
1745       ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1746       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
1747       ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
1748       delete cusparsestruct->workVector;
1749       delete cusparsestruct->rowoffsets_gpu;
1750       cusparsestruct->workVector = NULL;
1751       cusparsestruct->rowoffsets_gpu = NULL;
1752       try {
1753         if (a->compressedrow.use) {
1754           m    = a->compressedrow.nrows;
1755           ii   = a->compressedrow.i;
1756           ridx = a->compressedrow.rindex;
1757         } else {
1758           m    = A->rmap->n;
1759           ii   = a->i;
1760           ridx = NULL;
1761         }
1762         if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR row data");
1763         if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR column data");
         /* no host values yet: take the size from the row offsets and leave 'both' false,
            so the offloadmask below does not claim host/device agreement */
1764         if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
1765         else nnz = a->nz;
1766 
1767         /* create cusparse matrix */
1768         cusparsestruct->nrows = m;
1769         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1770         stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
1771         stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
1772         stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
1773 
         /* device-resident scalar constants; pointer mode is set to DEVICE below */
1774         err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
1775         err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1776         err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1777         err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1778         err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1779         err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1780         stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
1781 
1782         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1783         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1784           /* set the matrix */
1785           CsrMatrix *mat= new CsrMatrix;
1786           mat->num_rows = m;
1787           mat->num_cols = A->cmap->n;
1788           mat->num_entries = nnz;
1789           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1790           mat->row_offsets->assign(ii, ii + m+1);
1791 
1792           mat->column_indices = new THRUSTINTARRAY32(nnz);
1793           mat->column_indices->assign(a->j, a->j+nnz);
1794 
1795           mat->values = new THRUSTARRAY(nnz);
1796           if (a->a) mat->values->assign(a->a, a->a+nnz);
1797 
1798           /* assign the pointer */
1799           matstruct->mat = mat;
1800          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1801           if (mat->num_rows) { /* cusparse errors on empty matrices! */
1802             stat = cusparseCreateCsr(&matstruct->matDescr,
1803                                     mat->num_rows, mat->num_cols, mat->num_entries,
1804                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
1805                                     mat->values->data().get(),
1806                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
1807                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
1808           }
1809          #endif
1810         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1811          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1812           SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1813          #else
           /* CUDA < 11 only: build a temporary CSR, convert it to HYB/ELL, then free the CSR */
1814           CsrMatrix *mat= new CsrMatrix;
1815           mat->num_rows = m;
1816           mat->num_cols = A->cmap->n;
1817           mat->num_entries = nnz;
1818           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1819           mat->row_offsets->assign(ii, ii + m+1);
1820 
1821           mat->column_indices = new THRUSTINTARRAY32(nnz);
1822           mat->column_indices->assign(a->j, a->j+nnz);
1823 
1824           mat->values = new THRUSTARRAY(nnz);
1825           if (a->a) mat->values->assign(a->a, a->a+nnz);
1826 
1827           cusparseHybMat_t hybMat;
1828           stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1829           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1830             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1831           stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
1832               matstruct->descr, mat->values->data().get(),
1833               mat->row_offsets->data().get(),
1834               mat->column_indices->data().get(),
1835               hybMat, 0, partition);CHKERRCUSPARSE(stat);
1836           /* assign the pointer */
1837           matstruct->mat = hybMat;
1838 
1839           if (mat) {
1840             if (mat->values) delete (THRUSTARRAY*)mat->values;
1841             if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
1842             if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
1843             delete (CsrMatrix*)mat;
1844           }
1845          #endif
1846         }
1847 
1848         /* assign the compressed row indices */
1849         if (a->compressedrow.use) {
1850           cusparsestruct->workVector = new THRUSTARRAY(m);
1851           matstruct->cprowIndices    = new THRUSTINTARRAY(m);
1852           matstruct->cprowIndices->assign(ridx,ridx+m);
1853           tmp = m;
1854         } else {
1855           cusparsestruct->workVector = NULL;
1856           matstruct->cprowIndices    = NULL;
1857           tmp = 0;
1858         }
1859         ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);
1860 
1861         /* assign the pointer */
1862         cusparsestruct->mat = matstruct;
1863       } catch(char *ex) {
1864         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1865       }
1866       err  = WaitForCUDA();CHKERRCUDA(err);
1867       ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1868       cusparsestruct->nonzerostate = A->nonzerostate;
1869     }
1870     if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
1871   }
1872   PetscFunctionReturn(0);
1873 }
1874 
/* Thrust functor for zip iterators: given a tuple t = (src, dst),
   accumulate the first component into the second (dst += src).
   Callable from both host and device code. */
struct VecCUDAPlusEquals
{
  template <typename Tuple> __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
1884 
/* Thrust functor for zip iterators: given a tuple t = (src, dst),
   copy the first component into the second (dst = src).
   Callable from both host and device code. */
struct VecCUDAEquals
{
  template <typename Tuple> __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
1894 
/* Thrust functor for zip iterators: the mirror image of VecCUDAEquals -
   given a tuple t = (dst, src), copy the SECOND component into the FIRST
   (dst = src).  Callable from both host and device code. */
struct VecCUDAEqualsReverse
{
  template <typename Tuple> __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};
1904 
/*
  MatMatCusparse - per-product scratch data attached to C->product->data for
  the SeqAIJCUSPARSE mat-mat kernels; freed by MatDestroy_MatMatCusparse().
*/
1905 struct MatMatCusparse {
1906   PetscBool             cisdense; /* C was MATSEQDENSE on entry to the symbolic phase; convert the result back after numeric */
1907   PetscScalar           *Bt; /* CUDA < 11 only: buffer holding B^T, since csrmm does not support op(B) */
1908   Mat                   X;  /* intermediate dense result A*op(B) used by the PtAP/RARt paths */
1909   PetscBool             reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
1910   PetscLogDouble        flops; /* flop count logged in the numeric phase */
1911   CsrMatrix             *Bcsr; /* non-NULL when B is kept in compressed row storage */
1912 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1913   cusparseSpMatDescr_t  matSpBDescr; /* sparse descriptor for Bcsr */
1914   PetscBool             initialized;   /* C = alpha op(A) op(B) + beta C */
1915   cusparseDnMatDescr_t  matBDescr; /* dense descriptor for B */
1916   cusparseDnMatDescr_t  matCDescr; /* dense descriptor for C (or X) */
1917   PetscInt              Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
1918   size_t                mmBufferSize; /* size of mmBuffer in bytes */
1919   void                  *mmBuffer; /* workspace for cusparseSpMM / cusparseSpGEMM */
1920   void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
1921   cusparseSpGEMMDescr_t spgemmDesc; /* descriptor driving cusparseSpGEMM_compute/copy */
1922 #endif
1923 };
1924 
/*
  MatDestroy_MatMatCusparse - Destructor for the MatMatCusparse product data:
  releases the B^T buffer, the compressed-B CSR copy, all cuSPARSE
  descriptors and workspaces (CUDA >= 11), the intermediate dense matrix X,
  and finally the structure itself.
*/
1925 static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
1926 {
1927   PetscErrorCode   ierr;
1928   MatMatCusparse   *mmdata = (MatMatCusparse *)data;
1929   cudaError_t      cerr;
1930  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1931   cusparseStatus_t stat;
1932  #endif
1933 
1934   PetscFunctionBegin;
     /* Bt may be NULL; per the CUDA runtime docs cudaFree(NULL) is a no-op */
1935   cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr);
1936   delete mmdata->Bcsr;
1937  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1938   if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); }
1939   if (mmdata->mmBuffer)    { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); }
1940   if (mmdata->mmBuffer2)   { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); }
1941   if (mmdata->matBDescr)   { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); }
1942   if (mmdata->matCDescr)   { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); }
1943   if (mmdata->spgemmDesc)  { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); }
1944  #endif
1945   ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr);
1946   ierr = PetscFree(data);CHKERRQ(ierr);
1947   PetscFunctionReturn(0);
1948 }
1949 
1950 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1951 
/*
  MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA - Numeric phase for products
  of a SEQAIJCUSPARSE matrix A with a dense matrix B on the GPU.

  Handles AB, AtB, ABt, PtAP and RARt.  The sparse-times-dense kernel is
  cusparseSpMM for CUDA >= 11 (which supports op(B)) or cusparseXcsrmm for
  older CUDA, where B^T is formed explicitly with cublasXgeam into mmdata->Bt.
  For PtAP/RARt the intermediate X = A*op(B) is computed here and the closing
  dense-dense product is delegated to
  MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private().  If B (or C) arrived
  as a CPU dense matrix it is converted in place to MATSEQDENSECUDA for the
  computation and converted back before returning.
*/
1952 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
1953 {
1954   Mat_Product                  *product = C->product;
1955   Mat                          A,B;
1956   PetscInt                     m,n,blda,clda;
1957   PetscBool                    flg,biscuda;
1958   Mat_SeqAIJCUSPARSE           *cusp;
1959   cusparseStatus_t             stat;
1960   cusparseOperation_t          opA;
1961   const PetscScalar            *barray;
1962   PetscScalar                  *carray;
1963   PetscErrorCode               ierr;
1964   MatMatCusparse               *mmdata;
1965   Mat_SeqAIJCUSPARSEMultStruct *mat;
1966   CsrMatrix                    *csrmat;
1967   cudaError_t                  cerr;
1968 
1969   PetscFunctionBegin;
1970   MatCheckProduct(C,1);
1971   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
1972   mmdata = (MatMatCusparse*)product->data;
1973   A    = product->A;
1974   B    = product->B;
1975   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
1976   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
1977   /* currently CopyToGpu does not copy if the matrix is bound to CPU
1978      Instead of silently accepting the wrong answer, I prefer to raise the error */
1979   if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
1980   ierr   = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
1981   cusp   = (Mat_SeqAIJCUSPARSE*)A->spptr;
     /* pick op(A), the mult struct to use, and the result dimensions m x n */
1982   switch (product->type) {
1983   case MATPRODUCT_AB:
1984   case MATPRODUCT_PtAP:
1985     mat = cusp->mat;
1986     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1987     m   = A->rmap->n;
1988     n   = B->cmap->n;
1989     break;
1990   case MATPRODUCT_AtB:
1991     if (!A->form_explicit_transpose) {
1992       mat = cusp->mat;
1993       opA = CUSPARSE_OPERATION_TRANSPOSE;
1994     } else {
       /* use the cached explicit transpose of A instead of op(A) = A^T */
1995       ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
1996       mat  = cusp->matTranspose;
1997       opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
1998     }
1999     m = A->cmap->n;
2000     n = B->cmap->n;
2001     break;
2002   case MATPRODUCT_ABt:
2003   case MATPRODUCT_RARt:
2004     mat = cusp->mat;
2005     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2006     m   = A->rmap->n;
2007     n   = B->rmap->n;
2008     break;
2009   default:
2010     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2011   }
2012   if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
2013   csrmat = (CsrMatrix*)mat->mat;
2014   /* if the user passed a CPU matrix, copy the data to the GPU */
2015   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
2016   if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
2017   ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);
2018 
2019   ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
     /* PtAP/RARt write into the intermediate X; all other products write into C */
2020   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2021     ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
2022     ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
2023   } else {
2024     ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
2025     ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
2026   }
2027 
2028   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2029  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2030   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2031   /* (re)allocate mmBuffer if not initialized or LDAs are different */
2032   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2033     size_t mmBufferSize;
2034     if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
2035     if (!mmdata->matBDescr) {
2036       stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
2037       mmdata->Blda = blda;
2038     }
2039 
2040     if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
2041     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2042       stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
2043       mmdata->Clda = clda;
2044     }
2045 
2046     if (!mat->matDescr) {
2047       stat = cusparseCreateCsr(&mat->matDescr,
2048                                csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
2049                                csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
2050                                csrmat->values->data().get(),
2051                                CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2052                                CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
2053     }
2054     stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
2055                                    mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2056                                    mmdata->matCDescr,cusparse_scalartype,
2057                                    cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
     /* only grow the workspace; a previously allocated buffer that is large enough is reused */
2058     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2059       cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
2060       cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
2061       mmdata->mmBufferSize = mmBufferSize;
2062     }
2063     mmdata->initialized = PETSC_TRUE;
2064   } else {
2065     /* to be safe, always update pointers of the mats */
2066     stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
2067     stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
2068     stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
2069   }
2070 
2071   /* do cusparseSpMM, which supports transpose on B */
2072   stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
2073                       mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2074                       mmdata->matCDescr,cusparse_scalartype,
2075                       cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2076  #else
2077   PetscInt k;
2078   /* cusparseXcsrmm does not support transpose on B */
2079   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2080     cublasHandle_t cublasv2handle;
2081     cublasStatus_t cerr;
2082 
2083     ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
     /* form B^T explicitly into mmdata->Bt via an out-of-place transpose (geam) */
2084     cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
2085                        B->cmap->n,B->rmap->n,
2086                        &PETSC_CUSPARSE_ONE ,barray,blda,
2087                        &PETSC_CUSPARSE_ZERO,barray,blda,
2088                        mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
2089     blda = B->cmap->n;
2090     k    = B->cmap->n;
2091   } else {
2092     k    = B->rmap->n;
2093   }
2094 
2095   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2096   stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
2097                            csrmat->num_entries,mat->alpha_one,mat->descr,
2098                            csrmat->values->data().get(),
2099                            csrmat->row_offsets->data().get(),
2100                            csrmat->column_indices->data().get(),
2101                            mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
2102                            carray,clda);CHKERRCUSPARSE(stat);
2103  #endif
2104   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2105   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2106   ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
2107   ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
     /* PtAP/RARt: finish with the dense-dense product C = op(B) * X */
2108   if (product->type == MATPRODUCT_RARt) {
2109     ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
2110     ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
2111   } else if (product->type == MATPRODUCT_PtAP) {
2112     ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
2113     ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
2114   } else {
2115     ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
2116   }
     /* undo the in-place conversions done above so the caller sees the original types */
2117   if (mmdata->cisdense) {
2118     ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
2119   }
2120   if (!biscuda) {
2121     ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
2122   }
2123   PetscFunctionReturn(0);
2124 }
2125 
/*
  MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA - Symbolic phase for
  sparse-times-dense products: sets the sizes and type of C, allocates the
  MatMatCusparse product data (including the B^T buffer on CUDA < 11 and the
  intermediate dense X for PtAP/RARt), and installs the numeric callback.
*/
2126 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2127 {
2128   Mat_Product        *product = C->product;
2129   Mat                A,B;
2130   PetscInt           m,n;
2131   PetscBool          cisdense,flg;
2132   PetscErrorCode     ierr;
2133   MatMatCusparse     *mmdata;
2134   Mat_SeqAIJCUSPARSE *cusp;
2135 
2136   PetscFunctionBegin;
2137   MatCheckProduct(C,1);
2138   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
2139   A    = product->A;
2140   B    = product->B;
2141   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2142   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
2143   cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2144   if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
     /* result dimensions m x n for each supported product type */
2145   switch (product->type) {
2146   case MATPRODUCT_AB:
2147     m = A->rmap->n;
2148     n = B->cmap->n;
2149     break;
2150   case MATPRODUCT_AtB:
2151     m = A->cmap->n;
2152     n = B->cmap->n;
2153     break;
2154   case MATPRODUCT_ABt:
2155     m = A->rmap->n;
2156     n = B->rmap->n;
2157     break;
2158   case MATPRODUCT_PtAP:
2159     m = B->cmap->n;
2160     n = B->cmap->n;
2161     break;
2162   case MATPRODUCT_RARt:
2163     m = B->rmap->n;
2164     n = B->rmap->n;
2165     break;
2166   default:
2167     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2168   }
2169   ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
2170   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
2171   ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
2172   ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);
2173 
2174   /* product data */
2175   ierr = PetscNew(&mmdata);CHKERRQ(ierr);
2176   mmdata->cisdense = cisdense;
2177  #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
2178   /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
2179   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2180     cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
2181   }
2182  #endif
2183   /* for these products we need intermediate storage */
2184   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2185     ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
2186     ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
2187     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
2188       ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
2189     } else {
2190       ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
2191     }
2192   }
2193   C->product->data    = mmdata;
2194   C->product->destroy = MatDestroy_MatMatCusparse;
2195 
2196   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
2197   PetscFunctionReturn(0);
2198 }
2199 
/*
  MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE - Numeric phase of the
  sparse-sparse product C = op(A)*op(B), all operands SEQAIJCUSPARSE.

  AtB/ABt are handled through the cached explicit transposes (or reduced to
  AB when the corresponding operand is marked symmetric).  The multiply is
  cusparseSpGEMM_compute/copy for CUDA >= 11, the csrgemm path otherwise.
  When the symbolic phase already computed the values (mmdata->reusesym,
  api_user case) the multiply is skipped and only the assembly bookkeeping at
  'finalize' is performed.
*/
2200 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2201 {
2202   Mat_Product                  *product = C->product;
2203   Mat                          A,B;
2204   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2205   Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
2206   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2207   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2208   PetscBool                    flg;
2209   PetscErrorCode               ierr;
2210   cusparseStatus_t             stat;
2211   cudaError_t                  cerr;
2212   MatProductType               ptype;
2213   MatMatCusparse               *mmdata;
2214 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2215   cusparseSpMatDescr_t         BmatSpDescr;
2216 #endif
2217 
2218   PetscFunctionBegin;
2219   MatCheckProduct(C,1);
2220   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
2221   ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2222   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for C of type %s",((PetscObject)C)->type_name);
2223   mmdata = (MatMatCusparse*)C->product->data;
2224   A = product->A;
2225   B = product->B;
2226   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
2227     mmdata->reusesym = PETSC_FALSE;
2228     Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2229     if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2230     Cmat = Ccusp->mat;
2231     if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
2232     Ccsr = (CsrMatrix*)Cmat->mat;
2233     if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
2234     goto finalize;
2235   }
     /* empty product: nothing to compute, but still finish the assembly bookkeeping */
2236   if (!c->nz) goto finalize;
2237   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2238   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
2239   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2240   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
2241   if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2242   if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2243   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2244   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2245   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2246   if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2247   if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2248   if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2249   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2250   ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
2251 
     /* symmetric operands let us replace the transposed products with plain AB */
2252   ptype = product->type;
2253   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
2254   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
2255   switch (ptype) {
2256   case MATPRODUCT_AB:
2257     Amat = Acusp->mat;
2258     Bmat = Bcusp->mat;
2259     break;
2260   case MATPRODUCT_AtB:
2261     Amat = Acusp->matTranspose;
2262     Bmat = Bcusp->mat;
2263     break;
2264   case MATPRODUCT_ABt:
2265     Amat = Acusp->mat;
2266     Bmat = Bcusp->matTranspose;
2267     break;
2268   default:
2269     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2270   }
2271   Cmat = Ccusp->mat;
2272   if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2273   if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2274   if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
2275   Acsr = (CsrMatrix*)Amat->mat;
2276   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
2277   Ccsr = (CsrMatrix*)Cmat->mat;
2278   if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
2279   if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
2280   if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
2281   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2282 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2283   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
2284   stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2285                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2286                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2287                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2288   stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2289                              Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2290                              cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2291 #else
2292   stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2293                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2294                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2295                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2296                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2297 #endif
2298   ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
2299   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2300   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2301   C->offloadmask = PETSC_OFFLOAD_GPU;
2302 finalize:
2303   /* shorter version of MatAssemblyEnd_SeqAIJ */
2304   ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
2305   ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
2306   ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
2307   c->reallocs         = 0;
2308   C->info.mallocs    += 0;
2309   C->info.nz_unneeded = 0;
2310   C->assembled = C->was_assembled = PETSC_TRUE;
2311   C->num_ass++;
2312   PetscFunctionReturn(0);
2313 }
2314 
/* Symbolic phase of the sparse-sparse products C = A*B, A^t*B or A*B^t for SEQAIJCUSPARSE.
   Computes the nonzero pattern of C on the GPU (cusparseSpGEMM_* for CUDA >= 11.0,
   cusparseXcsrgemmNnz/csrgemm otherwise), copies the pattern back to the host
   Mat_SeqAIJ representation of C, and caches the cuSPARSE descriptors and work
   buffers in the product data (MatMatCusparse) for reuse by the numeric phase. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *a,*b,*c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     i,j,m,n,k;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble               flops;
  PetscBool                    biscompressed,ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
  size_t                       bufSize2;
  cusparseSpMatDescr_t         BmatSpDescr;
#else
  int                          cnz;
#endif

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A    = product->A;
  B    = product->B;
  /* both operands must be SEQAIJCUSPARSE in MAT_CUSPARSE_CSR storage */
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ*)A->data;
  b = (Mat_SeqAIJ*)B->data;
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
  ptype = product->type;
  /* a symmetric operand lets us lower A^t*B / A*B^t to the plain A*B kernel */
  if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
  if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  /* pick result dimensions (m x n, inner dim k) and which GPU mult structs act as
     the left/right CSR operands; transposed cases use the explicit transpose */
  switch (ptype) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    k = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr);
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  ierr  = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  ierr  = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  c     = (Mat_SeqAIJ*)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
  Ccsr  = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
    c->compressedrow.nrows = a->compressedrow.nrows;
    ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
    ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
  Ccusp->mat      = Cmat;
  Ccusp->mat->mat = Ccsr;
  Ccsr->num_rows    = Ccusp->nrows;
  Ccsr->num_cols    = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
  stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
  /* device-resident scalars for the POINTER_MODE_DEVICE alpha/beta arguments */
  cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
    thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
    c->nz = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix*)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix */
    CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
    Bcsr = new CsrMatrix;
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices;
    Bcsr->values         = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
      ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr = Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
                               Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                               Bcsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
  /* precompute flops count */
  if (ptype == MATPRODUCT_AB) {
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i+1];
      for (j=st; j<en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2.*(b->i[brow+1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt anzi = a->i[i+1] - a->i[i];
      const PetscInt bnzi = b->i[i+1] - b->i[i];
      flops += (2.*anzi)*bnzi;
    }
  } else { /* TODO */
    flops = 0.;
  }

  mmdata->flops = flops;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* CUDA >= 11: generic SpGEMM API. The two workEstimation calls (query then fill)
     and the two compute calls (query then fill) follow the documented protocol. */
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
                           NULL, NULL, NULL,
                           CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                           CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
  /* ask bufferSize bytes for external memory */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
  /* ask bufferSize again bytes for external memory */
  stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
  /* The CUSPARSE documentation is not clear, nor the API
     We need both buffers to perform the operations properly!
     mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
     it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
     is stored in the descriptor! What a messy API... */
  cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
  /* compute the intermediate product of A * B */
  stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  /* get matrix C non-zero entries C_nnz1 */
  stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
  c->nz = (PetscInt) C_nnz1;
  ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr);
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#else
  /* CUDA < 11: legacy csrgemm — Nnz pass for the pattern, then a full numeric gemm
     (the old API cannot do a symbolic-only factorization; see comment below) */
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
  stat = cusparseXcsrgemmNnz(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
  c->nz = cnz;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
     I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
     D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
  stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalizesym:
  /* mirror C's pattern into the host Mat_SeqAIJ so CPU code paths can use it */
  c->singlemalloc = PETSC_FALSE;
  c->free_a       = PETSC_TRUE;
  c->free_ij      = PETSC_TRUE;
  ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
  if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
    PetscInt *d_i = c->i;
    THRUSTINTARRAY ii(Ccsr->row_offsets->size());
    THRUSTINTARRAY jj(Ccsr->column_indices->size());
    ii   = *Ccsr->row_offsets;
    jj   = *Ccsr->column_indices;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  } else {
    PetscInt *d_i = c->i;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  if (ciscompressed) { /* need to expand host row offsets */
    PetscInt r = 0;
    c->i[0] = 0;
    for (k = 0; k < c->compressedrow.nrows; k++) {
      const PetscInt next = c->compressedrow.rindex[k];
      const PetscInt old = c->compressedrow.i[k];
      for (; r < next; r++) c->i[r+1] = old;
    }
    for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
  }
  ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
  c->maxnz = c->nz;
  c->nonzerorowcnt = 0;
  c->rmax = 0;
  /* derive per-row lengths and row statistics from the expanded row offsets */
  for (k = 0; k < m; k++) {
    const PetscInt nn = c->i[k+1] - c->i[k];
    c->ilen[k] = c->imax[k] = nn;
    c->nonzerorowcnt += (PetscInt)!!nn;
    c->rmax = PetscMax(c->rmax,nn);
  }
  ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
  Ccsr->num_entries = c->nz;

  C->nonzerostate++;
  ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
  Ccusp->nonzerostate = C->nonzerostate;
  C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
  C->preallocated  = PETSC_TRUE;
  C->assembled     = PETSC_FALSE;
  C->was_assembled = PETSC_FALSE;
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
2639 
2640 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2641 
/* Select the symbolic product implementation for a SEQAIJCUSPARSE A, dispatching
   on the type of B (sparse CUSPARSE or dense) and, for ABC, on the type of C.
   Falls back to the CPU SeqAIJ machinery when an operand is bound to the CPU or
   is not a supported GPU type. */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
  PetscErrorCode ierr;
  PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  /* classify the operands: B dense? B (and, for ABC, C) usable on the GPU? */
  ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr);
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr);
  }
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr);
    }
  }
  if (isdense) {
    /* sparse times dense */
    if (product->type == MATPRODUCT_ABC) {
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
    } else if (product->type == MATPRODUCT_AB   || product->type == MATPRODUCT_AtB  ||
               product->type == MATPRODUCT_ABt  || product->type == MATPRODUCT_PtAP ||
               product->type == MATPRODUCT_RARt) {
      if (product->A->boundtocpu) {
        ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
    }
  } else if (Biscusp && Ciscusp) {
    /* sparse times sparse, all operands on the GPU */
    if (product->type == MATPRODUCT_AB  || product->type == MATPRODUCT_AtB ||
        product->type == MATPRODUCT_ABt) {
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
    } else if (product->type == MATPRODUCT_PtAP || product->type == MATPRODUCT_RARt ||
               product->type == MATPRODUCT_ABC) {
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
    }
  } else { /* fallback for AIJ */
    ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2700 
/* yy = A*xx: delegate to the shared mult/mult-add kernel (no add vector, no transpose, no Hermitian) */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2709 
/* zz = A*xx + yy: delegate to the shared mult/mult-add kernel (no transpose, no Hermitian) */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2718 
/* yy = A^H*xx: delegate to the shared kernel with trans=TRUE, herm=TRUE (conjugate transpose) */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2727 
/* zz = A^H*xx + yy: delegate to the shared kernel with trans=TRUE, herm=TRUE (conjugate transpose) */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2736 
/* yy = A^T*xx: delegate to the shared kernel with trans=TRUE, herm=FALSE (plain transpose) */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2745 
/* Device kernel: y[idx[i]] += x[i] for 0 <= i < n.
   Grid-stride loop with a PetscInt index: the previous flat `int` index
   (blockIdx.x*blockDim.x + threadIdx.x) overflows 32-bit arithmetic when n
   exceeds INT_MAX under 64-bit PetscInt, and the loop also keeps the kernel
   correct for any launch configuration. idx is read-only, hence const.
   Assumes idx contains no duplicate entries (no atomics) — holds for the
   compressed-row index list this kernel is launched with. */
__global__ static void ScatterAdd(PetscInt n, const PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  const PetscInt stride = (PetscInt)blockDim.x*gridDim.x;
  for (PetscInt i = (PetscInt)blockIdx.x*blockDim.x + threadIdx.x; i < n; i += stride) y[idx[i]] += x[i];
}
2751 
2752 /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
2753 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
2754 {
2755   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
2756   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
2757   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
2758   PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
2759   PetscErrorCode               ierr;
2760   cudaError_t                  cerr;
2761   cusparseStatus_t             stat;
2762   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2763   PetscBool                    compressed;
2764 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2765   PetscInt                     nx,ny;
2766 #endif
2767 
2768   PetscFunctionBegin;
2769   if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
2770   if (!a->nonzerorowcnt) {
2771     if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
2772     else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
2773     PetscFunctionReturn(0);
2774   }
2775   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
2776   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2777   if (!trans) {
2778     matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2779     if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
2780   } else {
2781     if (herm || !A->form_explicit_transpose) {
2782       opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
2783       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2784     } else {
2785       if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);}
2786       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
2787     }
2788   }
2789   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
2790   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
2791 
2792   try {
2793     ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
2794     if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
2795     else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */
2796 
2797     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2798     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2799       /* z = A x + beta y.
2800          If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
2801          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
2802       */
2803       xptr = xarray;
2804       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
2805       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
2806      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2807       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
2808           allocated to accommodate different uses. So we get the length info directly from mat.
2809        */
2810       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2811         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2812         nx = mat->num_cols;
2813         ny = mat->num_rows;
2814       }
2815      #endif
2816     } else {
2817       /* z = A^T x + beta y
2818          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
2819          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
2820        */
2821       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
2822       dptr = zarray;
2823       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
2824       if (compressed) { /* Scatter x to work vector */
2825         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
2826         thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
2827                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2828                          VecCUDAEqualsReverse());
2829       }
2830      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2831       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2832         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2833         nx = mat->num_rows;
2834         ny = mat->num_cols;
2835       }
2836      #endif
2837     }
2838 
2839     /* csr_spmv does y = alpha op(A) x + beta y */
2840     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2841      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2842       if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
2843       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
2844         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2845         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2846         stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
2847                                 matstruct->matDescr,
2848                                 matstruct->cuSpMV[opA].vecXDescr, beta,
2849                                 matstruct->cuSpMV[opA].vecYDescr,
2850                                 cusparse_scalartype,
2851                                 cusparsestruct->spmvAlg,
2852                                 &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
2853         cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);
2854 
2855         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
2856       } else {
2857         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
2858         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
2859         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
2860       }
2861 
2862       stat = cusparseSpMV(cusparsestruct->handle, opA,
2863                                matstruct->alpha_one,
2864                                matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTransposeForMult() */
2865                                matstruct->cuSpMV[opA].vecXDescr,
2866                                beta,
2867                                matstruct->cuSpMV[opA].vecYDescr,
2868                                cusparse_scalartype,
2869                                cusparsestruct->spmvAlg,
2870                                matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
2871      #else
2872       CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2873       stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
2874                                mat->num_rows, mat->num_cols,
2875                                mat->num_entries, matstruct->alpha_one, matstruct->descr,
2876                                mat->values->data().get(), mat->row_offsets->data().get(),
2877                                mat->column_indices->data().get(), xptr, beta,
2878                                dptr);CHKERRCUSPARSE(stat);
2879      #endif
2880     } else {
2881       if (cusparsestruct->nrows) {
2882        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2883         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2884        #else
2885         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
2886         stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
2887                                  matstruct->alpha_one, matstruct->descr, hybMat,
2888                                  xptr, beta,
2889                                  dptr);CHKERRCUSPARSE(stat);
2890        #endif
2891       }
2892     }
2893     cerr = WaitForCUDA();CHKERRCUDA(cerr);
2894     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2895 
2896     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2897       if (yy) { /* MatMultAdd: zz = A*xx + yy */
2898         if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
2899           ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
2900         } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
2901           ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
2902         }
2903       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
2904         ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
2905       }
2906 
2907       /* ScatterAdd the result from work vector into the full vector when A is compressed */
2908       if (compressed) {
2909         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2910         /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred)
2911            and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
2912            prevent that. So I just add a ScatterAdd kernel.
2913          */
2914        #if 0
2915         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
2916         thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
2917                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
2918                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2919                          VecCUDAPlusEquals());
2920        #else
2921         PetscInt n = matstruct->cprowIndices->size();
2922         ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
2923        #endif
2924         cerr = WaitForCUDA();CHKERRCUDA(cerr);
2925         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2926       }
2927     } else {
2928       if (yy && yy != zz) {
2929         ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
2930       }
2931     }
2932     ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
2933     if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
2934     else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
2935   } catch(char *ex) {
2936     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
2937   }
2938   if (yy) {
2939     ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
2940   } else {
2941     ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
2942   }
2943   PetscFunctionReturn(0);
2944 }
2945 
/* zz = A^T*xx + yy, computed on the GPU.
   Thin dispatcher: the shared kernel does the work; the first flag requests the
   transpose product and the second presumably selects Hermitian (PETSC_FALSE here,
   cf. the separate MatMultHermitianTransposeAdd path) — kernel signature not in view. */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2954 
/* Finish assembly of a SeqAIJCUSPARSE matrix.
   If a device-side split-CSR matrix exists (assembled directly on the GPU), the
   offload mask is set to GPU after the (mostly no-op in that case) host assembly. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode             ierr;
  PetscSplitCSRDataStructure *d_mat = NULL;

  PetscFunctionBegin;
  /* capture the device matrix pointer (only unfactored matrices carry one in spptr) */
  if (A->factortype == MAT_FACTOR_NONE) d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); /* does very little if assembled on GPU - call it? */
  /* nothing more to do for flush assembly or CPU-bound matrices */
  if (mode != MAT_FLUSH_ASSEMBLY && !A->boundtocpu && d_mat) A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
2971 
2972 /* --------------------------------------------------------------------------------*/
2973 /*@
2974    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
2976    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
2977    assembly performance the user should preallocate the matrix storage by setting
2978    the parameter nz (or the array nnz).  By setting these parameters accurately,
2979    performance during matrix assembly can be increased by more than a factor of 50.
2980 
2981    Collective
2982 
2983    Input Parameters:
2984 +  comm - MPI communicator, set to PETSC_COMM_SELF
2985 .  m - number of rows
2986 .  n - number of columns
2987 .  nz - number of nonzeros per row (same for all rows)
2988 -  nnz - array containing the number of nonzeros in the various rows
2989          (possibly different for each row) or NULL
2990 
2991    Output Parameter:
2992 .  A - the matrix
2993 
2994    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
2996    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
2997 
2998    Notes:
2999    If nnz is given then nz is ignored
3000 
3001    The AIJ format (also called the Yale sparse matrix format or
3002    compressed row storage), is fully compatible with standard Fortran 77
3003    storage.  That is, the stored row and column indices can begin at
3004    either one (as in Fortran) or zero.  See the users' manual for details.
3005 
3006    Specify the preallocated storage with either nz or nnz (not both).
3007    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
3008    allocation.  For large problems you MUST preallocate memory or you
3009    will get TERRIBLE performance, see the users' manual chapter on matrices.
3010 
3011    By default, this format uses inodes (identical nodes) when possible, to
3012    improve numerical efficiency of matrix-vector products and solves. We
3013    search for consecutive rows with the same nonzero structure, thereby
3014    reusing matrix information to achieve increased efficiency.
3015 
3016    Level: intermediate
3017 
3018 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
3019 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); /* sequential matrix: local sizes equal global sizes */
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* cast drops const to match the preallocation routine's signature;
     presumably the array is only read — confirm in MatSeqAIJSetPreallocation_SeqAIJ */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3031 
/* Destroy a SeqAIJCUSPARSE matrix: free the GPU-side data (mult structures or
   triangular factors), any device-resident split-CSR mirror, remove composed
   methods, then destroy the host AIJ data. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode              ierr;
  PetscSplitCSRDataStructure  *d_mat = NULL;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    /* save the device split-CSR pointer before spptr is destroyed so it can still be freed below */
    d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
    ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL;
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    /* factored matrices keep triangular-factor data in spptr instead */
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  if (d_mat) {
    Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;
    cudaError_t                err;
    PetscSplitCSRDataStructure h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* copy the struct back to the host to recover device pointers stored inside it */
    err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    if (a->compressedrow.use) {
      /* diag.i is freed only in the compressed-row case — mirrors its conditional allocation, presumably; confirm in setup code */
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  /* remove the methods composed at convert/bind time */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); /* finally destroy the host-side AIJ data */
  PetscFunctionReturn(0);
}
3067 
3068 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
3069 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicate by building a host SeqAIJ copy first, then converting it
   in place to the CUSPARSE type. */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3079 
/* Y = a*X + Y entirely on the GPU.
   Three paths depending on the nonzero-pattern relationship:
   - SAME_NONZERO_PATTERN:   flat cuBLAS axpy on the two value arrays;
   - SUBSET_NONZERO_PATTERN: cuSPARSE csrgeam writing in place into Y's arrays;
   - otherwise:              fall back to the host implementation (pattern changes). */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry,*csrx;
  cudaError_t        cerr;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    /* the two matrices disagree on the implementation (one is bound to CPU): use the host path */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
  if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
  if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;
  /* see if we can turn this into a cublas axpy: compare the CSR structure arrays on the device */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
    if (eq) {
      eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
    }
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    /* Y = a*X + 1.0*Y via csrgeam; the output reuses Y's descriptor and arrays (in place) */
    cusparseStatus_t stat;
    PetscScalar      b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t           bufferSize;
    void             *buffer;
#endif

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    /* scalars a,b live on the host for this call */
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* CUDA >= 11 requires an explicit workspace query + allocation */
    stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
                                          &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                          &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                             cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    cerr = cudaFree(buffer);CHKERRCUDA(cerr);
#else
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
#endif
    /* restore the pointer mode expected by the rest of the code */
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical structure: the value arrays are conformal, so a flat cuBLAS axpy suffices */
    cublasHandle_t cublasv2handle;
    cublasStatus_t berr;
    PetscBLASInt   one = 1, bnz = 1;

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else {
    /* patterns differ: the result pattern changes, so do it on the host */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3180 
/* Y = a*Y: scale the nonzero value array directly on the GPU with a cuBLAS scal. */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)Y->data;
  PetscScalar    *yarr;
  cudaError_t    cuerr;
  cublasHandle_t handle;
  cublasStatus_t cberr;
  PetscBLASInt   inc = 1, n = 1;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEGetArray(Y,&yarr);CHKERRQ(ierr);
  ierr = PetscCUBLASGetHandle(&handle);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(aij->nz,&n);CHKERRQ(ierr); /* nz must fit in a BLAS int */
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  cberr = cublasXscal(handle,n,&a,yarr,inc);CHKERRCUBLAS(cberr);
  cuerr = WaitForCUDA();CHKERRCUDA(cuerr);
  ierr = PetscLogGpuFlops(n);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSERestoreArray(Y,&yarr);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3204 
/* Zero all stored values, on both the device copies (matrix and cached transpose,
   when present) and the host copy, then set the offload mask accordingly. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;
  PetscBool      both = PETSC_FALSE;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
    if (spptr->mat) {
      CsrMatrix *csr = (CsrMatrix*)spptr->mat->mat;
      if (csr->values) {
        both = PETSC_TRUE; /* device values zeroed too, so host and device stay in sync */
        thrust::fill(thrust::device,csr->values->begin(),csr->values->end(),0.);
      }
    }
    if (spptr->matTranspose) {
      CsrMatrix *csrT = (CsrMatrix*)spptr->matTranspose->mat;
      if (csrT->values) thrust::fill(thrust::device,csrT->values->begin(),csrT->values->end(),0.);
    }
  }
  /* zero the host values in place (replaces a MatZeroEntries_SeqAIJ call) */
  ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  A->offloadmask = both ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
3236 
/* Switch the matrix between CPU (flg=PETSC_TRUE) and GPU (flg=PETSC_FALSE)
   implementations by swapping the ops table entries and the composed methods.
   No-op for factored matrices. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  if (flg) {
    /* binding to CPU: make sure host data is current first */
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    /* install the host SeqAIJ implementations */
    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* drop the GPU-only composed methods */
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  } else {
    /* install the CUSPARSE implementations */
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  a->inode.use = flg; /* inodes are only exploited by the CPU kernels */
  PetscFunctionReturn(0);
}
3287 
/* Convert a SeqAIJ matrix to SeqAIJCUSPARSE.
   Allocates the GPU-side spptr (mult struct or tri-factors depending on factor
   type), sets the CUDA default vector type, installs the CUSPARSE ops table, and
   composes the type-specific methods. Works for MAT_INITIAL_MATRIX,
   MAT_REUSE_MATRIX and MAT_INPLACE_MATRIX. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  /* for MAT_INPLACE_MATRIX, *newmat is A itself */
  B = *newmat;

  /* vectors created for this matrix should live on the GPU */
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      /* regular matrix: mult/solve data plus cuSPARSE algorithm selections */
      Mat_SeqAIJCUSPARSE *spptr;
      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      spptr->format     = MAT_CUSPARSE_CSR;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      spptr->spmvAlg    = CUSPARSE_CSRMV_ALG1;    /* default, since we only support csr */
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
     #endif
      B->spptr = spptr;
    } else {
      /* factored matrix: triangular-factor data instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* installs the GPU implementations and composed methods */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3341 
/* Type constructor: build a plain SeqAIJ matrix, then convert it in place
   to the CUSPARSE type. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3351 
3352 /*MC
3353    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3354 
   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
3356    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
3357    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3358 
3359    Options Database Keys:
3360 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
3361 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3362 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3363 
3364   Level: beginner
3365 
3366 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3367 M*/
3368 
3369 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
3370 
3371 
/* Register the CUSPARSE solver package for all four factorization kinds
   (LU, Cholesky, ILU, ICC) on SeqAIJCUSPARSE matrices. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode      ierr;
  const MatFactorType ftypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  size_t              i;

  PetscFunctionBegin;
  for (i=0; i<sizeof(ftypes)/sizeof(ftypes[0]); i++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftypes[i],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3383 
/* Free the whole Mat_SeqAIJCUSPARSE structure: the GPU matrix and its cached
   transpose, the auxiliary device vectors, the cuSPARSE handle, and finally
   the struct itself. Safe to call with *cusparsestruct == NULL. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    /* C++ delete of a NULL pointer is a no-op, so these need no guards */
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    delete (*cusparsestruct)->csr2csc_i;
    if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3403 
/* Free a CsrMatrix: its three device arrays, then the container itself;
   the caller's pointer is reset to NULL. No-op if *mat is already NULL. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    CsrMatrix *m = *mat;

    delete m->values;
    delete m->column_indices;
    delete m->row_offsets;
    delete m;
    *mat = 0;
  }
  PetscFunctionReturn(0);
}
3416 
/* Free one triangular-factor structure: cuSPARSE descriptor and analysis info,
   the CSR data, the device solve buffer, the pinned host buffer (AA_h, freed
   with cudaFreeHost), the csr2csc buffer (CUDA >= 11), then the struct itself.
   Safe to call with *trifactor == NULL. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    if ((*trifactor)->solveBuffer)   {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
    if ((*trifactor)->AA_h)   {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);} /* pinned host memory */
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
   #endif
    ierr = PetscFree(*trifactor);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3436 
/* Free one mult structure: the stored matrix (CSR or HYB depending on format),
   its cuSPARSE descriptor, the compressed-row index vector, the device-resident
   scalar constants, and (CUDA >= 11) the cached SpMV descriptors/buffers.
   Safe to call with *matstruct == NULL; resets the caller's pointer to NULL. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
       #endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        /* fix: this error code was previously dropped on the floor */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices; /* delete of NULL is a no-op */
    /* alpha/beta constants live in device memory (device pointer mode) */
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* CUDA >= 11 additionally caches generic-API SpMV descriptors and work buffers */
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
3480 
/* Release the contents of a tri-factors structure (the four triangular factors,
   the permutation index vectors and the work vector) without freeing the
   structure or its cuSPARSE handle, so it can be refilled by a new factorization. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose);CHKERRQ(ierr);
    /* clear pointers right after delete so a subsequent Reset/Destroy is safe */
    delete fs->rpermIndices;  fs->rpermIndices = NULL;
    delete fs->cpermIndices;  fs->cpermIndices = NULL;
    delete fs->workVector;    fs->workVector   = NULL;
  }
  PetscFunctionReturn(0);
}
3500 
/* Fully destroy a tri-factors structure: reset its contents, destroy the
   cuSPARSE handle, and free the structure itself. Safe with *trifactors == NULL. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    /* fix: assignment was buried inside the if condition (-Wparentheses hazard) */
    handle = (*trifactors)->handle;
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3517 
/* Strict weak ordering on (i,j) index pairs: by row, then by column. */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    if (t1.get<0>() != t2.get<0>()) return t1.get<0>() < t2.get<0>();
    return t1.get<1>() < t2.get<1>();
  }
};
3528 
/* Equality of (i,j) index pairs: both components must match. */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>();
  }
};
3538 
/* Binary op for adjacent_difference: yields 0 when consecutive values are
   equal and 1 when they differ, flagging the start of a new run. */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != t2) ? 1 : 0;
  }
};
3547 
/* Logical OR of two integer flags, returned as a PetscInt (1 if either
   operand is nonzero, 0 otherwise). */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != 0 || t2 != 0) ? 1 : 0;
  }
};
3556 
3557 #include <thrust/iterator/discard_iterator.h>
/*
  MatSetValuesCOO_SeqAIJCUSPARSE - fills the matrix values on the GPU from v[],
  whose entries are ordered like the (i,j) pairs previously passed to
  MatSetPreallocationCOO_SeqAIJCUSPARSE() (the permutation cusp->cooPerm maps
  that ordering to CSR order). With INSERT_VALUES existing values are
  overwritten; with ADD_VALUES they are accumulated. Repeated (i,j) pairs in
  the COO pattern (signalled by cusp->cooPerm_a != NULL) are first reduced
  (summed) into their unique CSR slot. v may be a host or a device pointer;
  v == NULL with INSERT_VALUES zeros the stored values.
*/
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY                           *cooPerm_v = NULL; /* device staging buffer when v is a host pointer */
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                             *matrix;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;
  PetscInt                              n;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    /* no COO preallocation was done: fall back to a plain final assembly */
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) {
    /* NULL values: INSERT zeroes the matrix, ADD leaves it untouched */
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else {
    /* host pointer: copy the n values to the device first */
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v,v+n);
    d_v = cooPerm_v->data();
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
  }
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) {
      /* duplicates present: sum each group of duplicates into a work array
         (keyed by cooPerm_a), then add the work array into the CSR values */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* unique entries: permute v into CSR order and add in place */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals());
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else {
      /* unique entries: permute v into CSR order and overwrite in place */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    }
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalize:
  delete cooPerm_v;
  /* the GPU copy is now the authoritative one */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr);
  ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr);
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
3636 
/*
  MatSeqAIJCUSPARSEInvalidateTranspose - marks the cached explicit transpose of
  A as out of date; when destroy is PETSC_TRUE, the cached transpose structure
  and the csr2csc permutation array are freed as well.
*/
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (!cusp) PetscFunctionReturn(0); /* nothing cached to invalidate */
  if (destroy) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
    delete cusp->csr2csc_i;
    cusp->csr2csc_i = NULL;
  }
  A->transupdated = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3653 
3654 #include <thrust/binary_search.h>
/*
  MatSetPreallocationCOO_SeqAIJCUSPARSE - builds the CSR sparsity pattern of A
  on the GPU from n COO entries (coo_i[],coo_j[]) and caches the permutation
  cusp->cooPerm mapping the caller's COO ordering to sorted CSR ordering, for
  later use by MatSetValuesCOO_SeqAIJCUSPARSE(). When duplicate (i,j) pairs
  occur, cusp->cooPerm_a additionally stores, for each sorted COO entry, the
  index of the unique CSR slot it contributes to; otherwise cooPerm_a is NULL.
*/
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0; /* nzr counts rows with at least one entry */
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    /* entry count changed since last call: drop the cached permutations */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);           /* device copies of the COO row/col indices */
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);   /* per-row cumulative entry counts (device) */

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* sort the (i,j) pairs row-major, dragging the original positions along so
       cooPerm ends up mapping user ordering -> sorted (CSR) ordering */
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare());
    *cusp->cooPerm_a = d_i; /* keep sorted rows/cols for duplicate detection below */
    THRUSTINTARRAY w = d_j;

    /* collapse duplicate (i,j) pairs; nekey marks the new end of unique keys */
    auto nekey = thrust::unique(fkey, ekey, IJEqual());
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* I couldn't come up with a more elegant algorithm */
      /* map each sorted COO entry to its unique slot: flag positions where the
         row or the column changes, OR the flags, then inclusive-scan them */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff());
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());
      (*cusp->cooPerm_a)[0] = 0;
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum());
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>());
    }
    /* ii[r] = number of unique entries in rows 0..r (upper_bound on the sorted,
       deduplicated row indices) */
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(),
                        search_begin, search_begin + A->rmap->n,
                        ii.begin());
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* rebuild the host CSR arrays (a->i, a->j) from the device results */
    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0;
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    /* fill per-row lengths, maximum row length and the nonzero-row count */
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
  } else {
    /* empty pattern: preallocate an empty SeqAIJ matrix */
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);

  A->assembled = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3754 
/*
  MatSeqAIJCUSPARSEGetArrayRead - returns a read-only pointer to the device
  (GPU) array of CSR values, after making sure the GPU copy is up to date.

  Input Parameter:
.  A - a MATSEQAIJCUSPARSE matrix

  Output Parameter:
.  a - pointer to the device values

  Notes: neither the offload mask nor the object state is changed (the values
  must not be modified through this pointer); pair each call with
  MatSeqAIJCUSPARSERestoreArrayRead().
*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct"); /* guard before dereferencing cusp below */
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  PetscFunctionReturn(0);
}
3773 
/*
  MatSeqAIJCUSPARSERestoreArrayRead - restores the device pointer obtained
  with MatSeqAIJCUSPARSEGetArrayRead(); only NULLs the caller's pointer (no
  state increase, since read-only access cannot have changed the values).
*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  *a = NULL;
  PetscFunctionReturn(0);
}
3783 
/*
  MatSeqAIJCUSPARSEGetArray - returns a read/write pointer to the device (GPU)
  array of CSR values, after making sure the GPU copy is up to date.

  Input Parameter:
.  A - a MATSEQAIJCUSPARSE matrix

  Output Parameter:
.  a - pointer to the device values

  Notes: marks the GPU copy authoritative and invalidates the cached explicit
  transpose, since the caller may modify the values; pair each call with
  MatSeqAIJCUSPARSERestoreArray().
*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct"); /* guard before dereferencing cusp below */
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3804 
/*
  MatSeqAIJCUSPARSERestoreArray - restores the device pointer obtained with
  MatSeqAIJCUSPARSEGetArray(); bumps the object state, since the values may
  have been modified through the pointer, and NULLs the caller's pointer.
*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL;
  PetscFunctionReturn(0);
}
3817 
/*
  MatSeqAIJCUSPARSEGetArrayWrite - returns a write-only pointer to the device
  (GPU) array of CSR values. Unlike MatSeqAIJCUSPARSEGetArray(), the host
  values are NOT copied to the GPU first: the caller is expected to overwrite
  every value.

  Input Parameter:
.  A - a MATSEQAIJCUSPARSE matrix

  Output Parameter:
.  a - pointer to the device values

  Notes: marks the GPU copy authoritative and invalidates the cached explicit
  transpose; pair each call with MatSeqAIJCUSPARSERestoreArrayWrite().
*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct"); /* guard before dereferencing cusp below */
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3837 
/*
  MatSeqAIJCUSPARSERestoreArrayWrite - restores the device pointer obtained
  with MatSeqAIJCUSPARSEGetArrayWrite(); bumps the object state, since the
  values are assumed to have been written, and NULLs the caller's pointer.
*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL;
  PetscFunctionReturn(0);
}
3850 
/* Strict weak ordering on (row, col, value, flag) 4-tuples: sorts by row,
   then by column; the value and flag components do not take part in the
   comparison. Used to merge two COO streams in row-major order. */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    const int r1 = t1.get<0>(), r2 = t2.get<0>();
    if (r1 != r2) return r1 < r2;      /* different rows: order by row */
    return t1.get<1>() < t2.get<1>();  /* same row: order by column */
  }
};
3861 
/* Unary functor adding a fixed offset to an integer index. */
struct Shift
{
  int delta; /* amount added to every input */

  Shift(int shift) : delta(shift) {}
  __host__ __device__
  inline int operator() (const int &c)
  {
    return c + delta;
  }
};
3873 
/* merges two SeqAIJCUSPARSE matrices side by side, the [A';B']' operation in MATLAB notation */
3875 PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
3876 {
3877   PetscErrorCode               ierr;
3878   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
3879   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
3880   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
3881   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
3882   PetscInt                     Annz,Bnnz;
3883   cusparseStatus_t             stat;
3884   PetscInt                     i,m,n,zero = 0;
3885   cudaError_t                  cerr;
3886 
3887   PetscFunctionBegin;
3888   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
3889   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
3890   PetscValidPointer(C,4);
3891   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
3892   PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
3893   if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n);
3894   if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
3895   if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3896   if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3897   if (reuse == MAT_INITIAL_MATRIX) {
3898     m     = A->rmap->n;
3899     n     = A->cmap->n + B->cmap->n;
3900     ierr  = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
3901     ierr  = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
3902     ierr  = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
3903     c     = (Mat_SeqAIJ*)(*C)->data;
3904     Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
3905     Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
3906     Ccsr  = new CsrMatrix;
3907     Cmat->cprowIndices      = NULL;
3908     c->compressedrow.use    = PETSC_FALSE;
3909     c->compressedrow.nrows  = 0;
3910     c->compressedrow.i      = NULL;
3911     c->compressedrow.rindex = NULL;
3912     Ccusp->workVector       = NULL;
3913     Ccusp->nrows    = m;
3914     Ccusp->mat      = Cmat;
3915     Ccusp->mat->mat = Ccsr;
3916     Ccsr->num_rows  = m;
3917     Ccsr->num_cols  = n;
3918     stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
3919     stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
3920     stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
3921     cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
3922     cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
3923     cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
3924     cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
3925     cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
3926     cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
3927     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
3928     ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
3929     ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
3930     ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr);
3931     if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3932     if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3933 
3934     Acsr = (CsrMatrix*)Acusp->mat->mat;
3935     Bcsr = (CsrMatrix*)Bcusp->mat->mat;
3936     Annz = (PetscInt)Acsr->column_indices->size();
3937     Bnnz = (PetscInt)Bcsr->column_indices->size();
3938     c->nz = Annz + Bnnz;
3939     Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
3940     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3941     Ccsr->values = new THRUSTARRAY(c->nz);
3942     Ccsr->num_entries = c->nz;
3943     Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
3944     if (c->nz) {
3945       auto Acoo = new THRUSTINTARRAY32(Annz);
3946       auto Bcoo = new THRUSTINTARRAY32(Bnnz);
3947       auto Ccoo = new THRUSTINTARRAY32(c->nz);
3948       THRUSTINTARRAY32 *Aroff,*Broff;
3949 
3950       if (a->compressedrow.use) { /* need full row offset */
3951         if (!Acusp->rowoffsets_gpu) {
3952           Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
3953           Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
3954           ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
3955         }
3956         Aroff = Acusp->rowoffsets_gpu;
3957       } else Aroff = Acsr->row_offsets;
3958       if (b->compressedrow.use) { /* need full row offset */
3959         if (!Bcusp->rowoffsets_gpu) {
3960           Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
3961           Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
3962           ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
3963         }
3964         Broff = Bcusp->rowoffsets_gpu;
3965       } else Broff = Bcsr->row_offsets;
3966       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3967       stat = cusparseXcsr2coo(Acusp->handle,
3968                               Aroff->data().get(),
3969                               Annz,
3970                               m,
3971                               Acoo->data().get(),
3972                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
3973       stat = cusparseXcsr2coo(Bcusp->handle,
3974                               Broff->data().get(),
3975                               Bnnz,
3976                               m,
3977                               Bcoo->data().get(),
3978                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
3979       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
3980       auto Aperm = thrust::make_constant_iterator(1);
3981       auto Bperm = thrust::make_constant_iterator(0);
3982 #if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
3983       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
3984       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
3985 #else
3986       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
3987       auto Bcib = Bcsr->column_indices->begin();
3988       auto Bcie = Bcsr->column_indices->end();
3989       thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
3990 #endif
3991       auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
3992       auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
3993       auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
3994       auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
3995       auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
3996       auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
3997       auto p1 = Ccusp->cooPerm->begin();
3998       auto p2 = Ccusp->cooPerm->begin();
3999       thrust::advance(p2,Annz);
4000       PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
4001 #if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
4002       thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
4003 #endif
4004       auto cci = thrust::make_counting_iterator(zero);
4005       auto cce = thrust::make_counting_iterator(c->nz);
4006 #if 0 //Errors on SUMMIT cuda 11.1.0
4007       PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
4008 #else
4009       auto pred = thrust::identity<int>();
4010       PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
4011       PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
4012 #endif
4013       stat = cusparseXcoo2csr(Ccusp->handle,
4014                               Ccoo->data().get(),
4015                               c->nz,
4016                               m,
4017                               Ccsr->row_offsets->data().get(),
4018                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4019       cerr = WaitForCUDA();CHKERRCUDA(cerr);
4020       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4021       delete wPerm;
4022       delete Acoo;
4023       delete Bcoo;
4024       delete Ccoo;
4025 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4026       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
4027                                Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
4028                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4029                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4030 #endif
4031       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
4032         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4033         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4034         CsrMatrix *CcsrT = new CsrMatrix;
4035         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4036         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4037 
4038         (*C)->form_explicit_transpose = PETSC_TRUE;
4039         (*C)->transupdated = PETSC_TRUE;
4040         Ccusp->rowoffsets_gpu = NULL;
4041         CmatT->cprowIndices = NULL;
4042         CmatT->mat = CcsrT;
4043         CcsrT->num_rows = n;
4044         CcsrT->num_cols = m;
4045         CcsrT->num_entries = c->nz;
4046 
4047         CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
4048         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4049         CcsrT->values = new THRUSTARRAY(c->nz);
4050 
4051         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4052         auto rT = CcsrT->row_offsets->begin();
4053         if (AT) {
4054           rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
4055           thrust::advance(rT,-1);
4056         }
4057         if (BT) {
4058           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
4059           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
4060           thrust::copy(titb,tite,rT);
4061         }
4062         auto cT = CcsrT->column_indices->begin();
4063         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
4064         if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
4065         auto vT = CcsrT->values->begin();
4066         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4067         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4068         cerr = WaitForCUDA();CHKERRCUDA(cerr);
4069         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4070 
4071         stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
4072         stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4073         stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
4074         cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
4075         cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
4076         cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
4077         cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4078         cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4079         cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4080 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4081         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
4082                                  CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
4083                                  CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4084                                  CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4085 #endif
4086         Ccusp->matTranspose = CmatT;
4087       }
4088     }
4089 
4090     c->singlemalloc = PETSC_FALSE;
4091     c->free_a       = PETSC_TRUE;
4092     c->free_ij      = PETSC_TRUE;
4093     ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
4094     ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
4095     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
4096       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4097       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4098       ii   = *Ccsr->row_offsets;
4099       jj   = *Ccsr->column_indices;
4100       cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4101       cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4102     } else {
4103       cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4104       cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4105     }
4106     ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
4107     ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
4108     ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
4109     c->maxnz = c->nz;
4110     c->nonzerorowcnt = 0;
4111     c->rmax = 0;
4112     for (i = 0; i < m; i++) {
4113       const PetscInt nn = c->i[i+1] - c->i[i];
4114       c->ilen[i] = c->imax[i] = nn;
4115       c->nonzerorowcnt += (PetscInt)!!nn;
4116       c->rmax = PetscMax(c->rmax,nn);
4117     }
4118     ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
4119     ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
4120     (*C)->nonzerostate++;
4121     ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
4122     ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
4123     Ccusp->nonzerostate = (*C)->nonzerostate;
4124     (*C)->preallocated  = PETSC_TRUE;
4125   } else {
4126     if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",(*C)->rmap->n,B->rmap->n);
4127     c = (Mat_SeqAIJ*)(*C)->data;
4128     if (c->nz) {
4129       Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
4130       if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
4131       if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4132       if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
4133       ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
4134       ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
4135       if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4136       if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4137       Acsr = (CsrMatrix*)Acusp->mat->mat;
4138       Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4139       Ccsr = (CsrMatrix*)Ccusp->mat->mat;
4140       if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
4141       if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
4142       if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
4143       if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
4144       if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
4145       auto pmid = Ccusp->cooPerm->begin();
4146       thrust::advance(pmid,Acsr->num_entries);
4147       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4148       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
4149                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
4150       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
4151                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4152       thrust::for_each(zibait,zieait,VecCUDAEquals());
4153       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
4154                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4155       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
4156                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
4157       thrust::for_each(zibbit,ziebit,VecCUDAEquals());
4158       ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr);
4159       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
4160         if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4161         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4162         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4163         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4164         CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
4165         auto vT = CcsrT->values->begin();
4166         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4167         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4168         (*C)->transupdated = PETSC_TRUE;
4169       }
4170       cerr = WaitForCUDA();CHKERRCUDA(cerr);
4171       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4172     }
4173   }
4174   ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
4175   (*C)->assembled     = PETSC_TRUE;
4176   (*C)->was_assembled = PETSC_FALSE;
4177   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
4178   PetscFunctionReturn(0);
4179 }
4180 
/* Copy selected entries of A's value array into v[].

   If idx is non-NULL, v[i] = a[idx[i]] for i = 0..n-1 (a gather); otherwise the
   first n values are copied contiguously. v may point to either host or device
   memory; the destination space is detected with isCudaMem() and the copies/
   logging adapt accordingly. A's values are read through
   MatSeqAIJCUSPARSEGetArrayRead(), so they live on the device in both cases. */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  PetscErrorCode    ierr;
  bool              dmem;  /* true when v is device memory */
  const PetscScalar *av;
  cudaError_t       cerr;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
  if (n && idx) {
    /* stage the index list on the device (host -> device transfer, logged below) */
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

    THRUSTARRAY *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w = new THRUSTARRAY(n); /* device scratch; gathered values are copied back to host v below */
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* gather av[idx[i]] -> dv[i] in a single fused pass via a permutation iterator */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (w) {
      cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    delete w;
  } else {
    /* no index list: straight copy of the leading n values */
    cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  /* fixed: the host-destination copies above are device->host, so log GpuToCpu
     (the original logged PetscLogCpuToGpu, i.e. the wrong direction) */
  if (!dmem) { ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr); }
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4220