xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision d0a29bd77466da32e237a4ee4d6c05447bce64bd)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_CXX_COMPLEX_FIX
7 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
8 
9 #include <petscconf.h>
10 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
11 #include <../src/mat/impls/sbaij/seq/sbaij.h>
12 #include <../src/vec/vec/impls/dvecimpl.h>
13 #include <petsc/private/vecimpl.h>
14 #undef VecType
15 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
16 #include <thrust/async/for_each.h>
17 
18 const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
19 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
20   /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
21     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
22 
23   typedef enum {
24       CUSPARSE_MV_ALG_DEFAULT = 0,
25       CUSPARSE_COOMV_ALG      = 1,
26       CUSPARSE_CSRMV_ALG1     = 2,
27       CUSPARSE_CSRMV_ALG2     = 3
28   } cusparseSpMVAlg_t;
29 
30   typedef enum {
31       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
32       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
33       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
34       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
35       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
36       CUSPARSE_SPMM_ALG_DEFAULT = 0,
37       CUSPARSE_SPMM_COO_ALG1    = 1,
38       CUSPARSE_SPMM_COO_ALG2    = 2,
39       CUSPARSE_SPMM_COO_ALG3    = 3,
40       CUSPARSE_SPMM_COO_ALG4    = 5,
41       CUSPARSE_SPMM_CSR_ALG1    = 4,
42       CUSPARSE_SPMM_CSR_ALG2    = 6,
43   } cusparseSpMMAlg_t;
44 
45   typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
48   } cusparseCsr2CscAlg_t;
49   */
50   const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
51   const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
52   const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
53 #endif
54 
55 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
56 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
57 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
58 
59 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
60 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
61 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
62 
63 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
64 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
65 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
66 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
67 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
68 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
69 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
70 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
71 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
72 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
73 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
74 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
75 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
76 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
77 
78 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
79 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
80 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
81 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
82 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
83 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
84 
85 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
86 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
87 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);
88 
89 PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
90 PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
91 
92 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
93 
/*
   MatCUSPARSESetStream - records the given CUDA stream in the matrix's GPU data
   and attaches it to the matrix's cuSPARSE handle, so subsequent cuSPARSE calls
   for this matrix are queued on that stream.

   Errors with PETSC_ERR_COR if the matrix has no GPU data (spptr) yet.
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   status;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusp->stream = stream;
  status = cusparseSetStream(cusp->handle,cusp->stream);CHKERRCUSPARSE(status);
  PetscFunctionReturn(0);
}
105 
/*
   MatCUSPARSESetHandle - adopts an externally supplied cuSPARSE handle for this
   matrix, destroying any different handle the matrix currently owns, and sets
   the handle's pointer mode to CUSPARSE_POINTER_MODE_DEVICE.

   Errors with PETSC_ERR_COR if the matrix has no GPU data (spptr) yet.
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   status;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusp->handle != handle) {
    /* release the handle we previously held before adopting the new one */
    if (cusp->handle) {
      status = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(status);
    }
    cusp->handle = handle;
  }
  status = cusparseSetPointerMode(cusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(status);
  PetscFunctionReturn(0);
}
122 
/*
   MatCUSPARSEClearHandle - forgets (without destroying) the cuSPARSE handle
   stored in the matrix's GPU data.  No-op unless A is a MATSEQAIJCUSPARSE
   matrix with allocated GPU data.
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          isseqaijcusp;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&isseqaijcusp);CHKERRQ(ierr);
  /* only drop the reference; ownership/destruction is handled elsewhere */
  if (isseqaijcusp && cusp && cusp->handle) cusp->handle = 0;
  PetscFunctionReturn(0);
}
135 
/* Query callback composed on factor matrices: reports MATSOLVERCUSPARSE as the
   solver package that produced this factorization (A itself is unused). */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
142 
143 /*MC
144   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
145   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
147   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
148   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
149   algorithms are not recommended. This class does NOT support direct solver operations.
150 
151   Level: beginner
152 
153 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
154 M*/
155 
/*
   MatGetFactor_seqaijcusparse_cusparse - creates an empty MATSEQAIJCUSPARSE
   factor matrix for A and installs the symbolic-factorization callbacks that
   match the requested factor type (LU/ILU/ILUDT or Cholesky/ICC).

   The factor matrix is square with the same local row count as A and uses
   reorderings (useordering = PETSC_TRUE).
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscInt       n = A->rmap->n;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  (*B)->factortype  = ftype;
  (*B)->useordering = PETSC_TRUE;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU:
  case MAT_FACTOR_ILUDT:
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC:
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  }

  /* storage is provided later by the symbolic factorization, so skip allocation here */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
181 
/*
   MatCUSPARSESetFormat_SeqAIJCUSPARSE - type-specific implementation behind
   MatCUSPARSESetFormat().  Sequential matrices keep a single storage format,
   so MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL set the same field; any other
   operation is rejected.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT: /* fall through: both operations share one format on sequential matrices */
  case MAT_CUSPARSE_ALL:
    cusp->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
199 
200 /*@
201    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
202    operation. Only the MatMult operation can use different GPU storage formats
203    for MPIAIJCUSPARSE matrices.
204    Not Collective
205 
206    Input Parameters:
207 +  A - Matrix of type SEQAIJCUSPARSE
208 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
209 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
210 
211    Output Parameter:
212 
213    Level: intermediate
214 
215 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
216 @*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  /* dispatch to the type-specific implementation if one is composed; silently a no-op otherwise */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
226 
227 /*@
228    MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose
229 
230    Collective on mat
231 
232    Input Parameters:
233 +  A - Matrix of type SEQAIJCUSPARSE
234 -  transgen - the boolean flag
235 
236    Level: intermediate
237 
238 .seealso: MATSEQAIJCUSPARSE, MatAIJCUSPARSESetGenerateTranspose()
239 @*/
PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
{
  PetscBool      iscusp;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&iscusp);CHKERRQ(ierr);
  if (iscusp) { /* silently ignore other matrix types */
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

    if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
    cusp->transgen = transgen;
    if (!transgen) {
      /* drop any cached transpose now, so a later re-enable cannot pick up stale data */
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}
259 
/* Processes the -mat_cusparse_* runtime options for a SEQAIJCUSPARSE matrix:
   explicit-transpose generation, storage format for SpMV and/or TriSolve, and
   (CUDA >= 11 only) the cuSPARSE SpMV/SpMM/csr2csc algorithm choices.
   Factored matrices (factortype != MAT_FACTOR_NONE) take no options here. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscBool transgen = cusparsestruct->transgen;

    ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr);}

    /* format for SpMV only */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    /* format for all operations; processed second so it overrides the SpMV-only option if both are given */
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");

    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    /* same positional-consistency check as for SpMV above */
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    /* note MatCUSPARSECsr2CscAlgorithms[] has a placeholder at index 0 since the cuSPARSE enum starts at 1 */
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
   #endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
300 
/* Symbolic ILU: discard any stale GPU triangular-factor data, run the host
   symbolic phase, then route the numeric phase to the CUSPARSE implementation. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&factors);CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
312 
/* Symbolic LU: discard any stale GPU triangular-factor data, run the host
   symbolic phase, then route the numeric phase to the CUSPARSE implementation. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&factors);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
324 
/* Symbolic ICC: discard any stale GPU triangular-factor data, run the host
   symbolic phase, then route the numeric phase to the CUSPARSE implementation. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&factors);CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
336 
/* Symbolic Cholesky: discard any stale GPU triangular-factor data, run the host
   symbolic phase, then route the numeric phase to the CUSPARSE implementation. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&factors);CHKERRQ(ierr);
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
348 
/*
  Builds (or refreshes) the unit-diagonal lower triangular factor L on the GPU
  from the host-side factored Mat_SeqAIJ data of A.  On the first call the CSR
  structure (row offsets, column indices, values) is assembled in pinned host
  memory, uploaded into thrust device arrays, and a cuSPARSE triangular-solve
  analysis is run; on later calls only the numerical values are re-copied into
  the existing device arrays.  No-op unless the up-to-date data is on the CPU.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* = n unit diagonal entries + strictly-lower entries of rows 1..n-1 (ai[n]-ai[1]) */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) { /* first build: assemble the full CSR structure */
        PetscScalar                       *AALo;

        /* pinned host buffers so the thrust assign()s below copy asynchronously-friendly memory */
        cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the lower triangular matrix */
        cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the lower triangular matrix: row 0 is just the unit diagonal;
           rows 1..n-1 copy their strictly-lower entries then append a 1.0 diagonal */
        AiLo[0]  = (PetscInt) 0;
        AiLo[n]  = nzLower;
        AjLo[0]  = (PetscInt) 0;
        AALo[0]  = (MatScalar) 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        rowOffset= 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description: zero-based, lower fill, unit diagonal */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix: upload the host CSR arrays into device (thrust) storage */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* newer cuSPARSE triangular solves need an explicit work buffer; query and allocate it */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
                               #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                               #endif
);CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        /* keep the pinned values buffer (AALo) for fast refreshes; the index buffers are no longer needed */
        loTriFactor->AA_h = AALo;
        cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the lower triangular matrix (values only; sparsity pattern is unchanged) */
        loTriFactor->AA_h[0]  = 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
          offset      += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset      += 1;
          v  += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
493 
/*
  Builds (or refreshes) the upper triangular factor U on the GPU from the
  host-side factored Mat_SeqAIJ data of A.  The upper triangle is stored via
  a->diag (adiag) in reverse row order with inverted diagonal entries; here it
  is unpacked into a forward CSR with the reciprocal diagonal placed first in
  each row.  First call assembles the structure and runs the cuSPARSE solve
  analysis; later calls re-copy values only.  No-op unless data is on the CPU.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      /* adiag decreases with the row index, so this spans all of U including diagonals */
      nzUpper = adiag[0]-adiag[n];
      if (!upTriFactor) { /* first build: assemble the full CSR structure */
        PetscScalar *AAUp;

        /* pinned host buffers feeding the thrust device arrays below */
        cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix, walking rows from last to first since
           the factored storage keeps U rows in reverse order */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;
          vi = aj + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements (stored inverted in the factor: write 1/v[nz]) */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1./v[nz];
          AiUp[i]      = AiUp[i+1] - (nz+1);

          ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description: zero-based, upper fill, non-unit diagonal */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix: upload the host CSR arrays into device (thrust) storage */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = n;
        upTriFactor->csrMat->num_cols = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* newer cuSPARSE triangular solves need an explicit work buffer; query and allocate it */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                     upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                     upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                     upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                     &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
                               #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                               #endif
);CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
        /* keep the pinned values buffer (AAUp) for fast refreshes; the index buffers are no longer needed */
        upTriFactor->AA_h = AAUp;
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only; sparsity pattern is unchanged */
        if (!upTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements (stored inverted: write 1/v[nz]) */
          upTriFactor->AA_h[offset] = 1./v[nz];
          ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
        ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
637 
/* Build the ILU triangular factors on the GPU and cache the row/column
   permutation indices (as thrust arrays) needed by the permuted solves. */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *aij        = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *triFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           rowIS = aij->row,colIS = aij->icol;
  PetscBool                    rowIdentity,colIdentity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!triFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");

  /* copy both triangular factors to the GPU and run the solve analysis */
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  if (!triFactors->workVector) triFactors->workVector = new THRUSTARRAY(n);
  triFactors->nnz = aij->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* row permutation indices: only uploaded when the row ordering is non-trivial */
  ierr = ISIdentity(rowIS,&rowIdentity);CHKERRQ(ierr);
  if (!rowIdentity && !triFactors->rpermIndices) {
    const PetscInt *ridx;

    ierr = ISGetIndices(rowIS,&ridx);CHKERRQ(ierr);
    triFactors->rpermIndices = new THRUSTINTARRAY(n);
    triFactors->rpermIndices->assign(ridx,ridx+n);
    ierr = ISRestoreIndices(rowIS,&ridx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* column permutation indices: only uploaded when the column ordering is non-trivial */
  ierr = ISIdentity(colIS,&colIdentity);CHKERRQ(ierr);
  if (!colIdentity && !triFactors->cpermIndices) {
    const PetscInt *cidx;

    ierr = ISGetIndices(colIS,&cidx);CHKERRQ(ierr);
    triFactors->cpermIndices = new THRUSTINTARRAY(n);
    triFactors->cpermIndices->assign(cidx,cidx+n);
    ierr = ISRestoreIndices(colIS,&cidx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
681 
/* Build both ICC triangular factors (U and, via transposed solves, L = U^T)
   on the GPU from the host Cholesky factor.  On first call the CSR structure
   and values are created; on later calls only the values are refreshed.
   The upper factor stores a unit diagonal implicitly (CUSPARSE_DIAG_TYPE_UNIT)
   with 1/d(i) placed in the value array; the lower factor reuses the SAME
   row_offsets/column_indices as the upper one and is solved with
   CUSPARSE_OPERATION_TRANSPOSE. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is cast both to Mat_SeqAIJ (for nz) and Mat_SeqSBAIJ
     (for i/j/a) — this relies on the two structs sharing the layout of those
     members; TODO confirm against the struct definitions */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers for the upper/lower factor values */
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (!upTriFactor && !loTriFactor) {
        /* first call: build structure (AiUp/AjUp) as well as the values */
        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements: both factors store 1/d(i)
             at the diagonal position (v[nz] is the diagonal entry of row i) */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            /* off-diagonal entries: upper factor gets -v, lower factor
               gets -v/d(i) (the negation then division below) */
            ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        /* csrsv2 (CUDA >= 9) requires MATRIX_TYPE_GENERAL; the legacy API
           used MATRIX_TYPE_TRIANGULAR */
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* csrsv2 needs an explicit user-allocated work buffer */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                                #endif
);CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        /* the "lower" factor is stored as UPPER fill and solved with op =
           TRANSPOSE below, i.e. L x = b is done as U^T x = b */
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix: shares AiUp/AjUp structure with the upper factor,
           only the values (AALo) differ */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                                #endif
);CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      } else {
        /* subsequent calls: structure already on the GPU, refresh values only */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        /* NOTE(review): these messages say "cusparseTriFactors" but the checks
           are really for the up/lo factor structs */
        if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
897 
/* Build the ICC triangular factors on the GPU and, for a non-trivial
   ordering, upload both the permutation and its inverse as thrust arrays. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *aij        = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *triFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           permIS = aij->row;
  PetscBool                    permIdentity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!triFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!triFactors->workVector) triFactors->workVector = new THRUSTARRAY(n);
  /* factor is stored as upper triangle only: count both triangles once, diagonal once */
  triFactors->nnz = (aij->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* upload permutation and inverse permutation when the ordering is non-trivial */
  ierr = ISIdentity(permIS,&permIdentity);CHKERRQ(ierr);
  if (!permIdentity) {
    IS             invIS;
    const PetscInt *inv,*perm;

    ierr = ISInvertPermutation(permIS,PETSC_DECIDE,&invIS);CHKERRQ(ierr);
    ierr = ISGetIndices(invIS,&inv);CHKERRQ(ierr);
    ierr = ISGetIndices(permIS,&perm);CHKERRQ(ierr);
    triFactors->rpermIndices = new THRUSTINTARRAY(n);
    triFactors->rpermIndices->assign(perm,perm+n);
    triFactors->cpermIndices = new THRUSTINTARRAY(n);
    triFactors->cpermIndices->assign(inv,inv+n);
    ierr = ISRestoreIndices(invIS,&inv);CHKERRQ(ierr);
    ierr = ISDestroy(&invIS);CHKERRQ(ierr);
    ierr = ISRestoreIndices(permIS,&perm);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
935 
/* Numeric LU factorization: run the CPU SeqAIJ factorization, then select
   the MatSolve variant (natural ordering vs. permuted) and move the
   triangular factors to the GPU. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      naturalRows,naturalCols;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* factorization happens on the host; make sure the host copy of A is current */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&naturalRows);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&naturalCols);CHKERRQ(ierr);
  if (naturalRows && naturalCols) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
966 
/* Numeric Cholesky factorization: run the CPU SeqAIJ factorization, then
   select the MatSolve variant (natural ordering vs. permuted) and move the
   triangular factors to the GPU. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      naturalOrdering;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* factorization happens on the host; make sure the host copy of A is current */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(ip,&naturalOrdering);CHKERRQ(ierr);
  if (naturalOrdering) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
996 
/* Build explicit transposes (CSC form, via csr2csc) of both triangular
   factors and run the cuSPARSE solve analysis on them, so that
   MatSolveTranspose can use non-transposed triangular solves.  The results
   are stored in loTriFactorPtrTranspose/upTriFactorPtrTranspose of A->spptr.

   Bug fix: the MAT_CUSPARSEGenerateTranspose log event was "begun" twice for
   each factor (PetscLogEventBegin where PetscLogEventEnd was intended after
   each csr2csc), corrupting the event nesting in -log_view; both occurrences
   now call PetscLogEventEnd, matching the Begin/End pairing used for
   MAT_CUSPARSESolveAnalysis in this same function. */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor:
     same type/base/diag, but the fill mode flips under transposition */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation: the transpose is stored explicitly, so solves on it are non-transposed */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* cusparseCsr2cscEx2 requires a caller-allocated work buffer */
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                       loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                       loTriFactor->csrMat->values->data().get(),
                                       loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(),
                                       loTriFactorT->csrMat->values->data().get(),
                                       loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer
                        #else
                          loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
                        #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); /* was erroneously PetscLogEventBegin */

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                           loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                          #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                upTriFactor->csrMat->values->data().get(),
                                upTriFactor->csrMat->row_offsets->data().get(),
                                upTriFactor->csrMat->column_indices->data().get(),
                                upTriFactorT->csrMat->values->data().get(),
                                upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                CUSPARSE_ACTION_NUMERIC,indexBase,
                                CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer
                        #else
                          upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
                        #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); /* was erroneously PetscLogEventBegin */

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
  #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
  #endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                           upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                          #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
1198 
/* Unary functor converting a PetscScalar to a PetscInt by truncating its real
   part.  Used below (thrust::transform in MatSeqAIJCUSPARSEGenerateTransposeForMult)
   to turn csr2csc output -- where the "values" hold sequence indices -- back
   into an integer permutation vector. */
struct PetscScalarToPetscInt
{
  __host__ __device__
  PetscInt operator()(PetscScalar s)
  {
    return static_cast<PetscInt>(PetscRealPart(s));
  }
};
1207 
/*
  Builds or refreshes the explicit device transpose (cusparsestruct->matTranspose)
  so transpose products can run as a regular SpMV on A^T.

  Early exits: transpose generation not requested (transgen), an empty matrix,
  or the transpose already up to date (transupdated).

  For MAT_CUSPARSE_CSR the numerical refresh is driven by a cached permutation
  vector csr2csc_i: on first use, csr2csc is run on the index sequence
  0,1,2,... so the "values" of the transpose reveal where each transposed
  entry came from; subsequent updates are a plain gather through that
  permutation (no csr2csc rerun on the data).  ELL/HYB go through
  hyb2csr -> csr2csc -> csr2hyb and are only supported before CUDA 11.
*/
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (!cusparsestruct->transgen || !A->rmap->n || !A->cmap->n) PetscFunctionReturn(0);
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); /* make sure the device values are current first */
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (cusparsestruct->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing matTranspose struct");
  if (cusparsestruct->transupdated) PetscFunctionReturn(0); /* nothing stale: done */
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
    /* non-CSR formats have no incremental update path: rebuild from scratch */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
  }
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
    indexBase = cusparseGetMatIndexBase(matstruct->descr); /* the transpose inherits A's index base */
    stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

    /* set alpha and beta: device-resident scalar constants used by SpMV calls */
    err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* allocate the CSR arrays of A^T: dimensions swapped, same nnz */
      CsrMatrix *matrixT = new CsrMatrix;
      matstructT->mat = matrixT;
      matrixT->num_rows = A->cmap->n;
      matrixT->num_cols = A->rmap->n;
      matrixT->num_entries = a->nz;
      matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values = new THRUSTARRAY(a->nz);

      /* uncompressed row offsets of A on the GPU, needed as csr2csc input */
      if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* CUDA 11 generic API also wants a cusparseSpMatDescr_t for A^T */
      stat = cusparseCreateCsr(&matstructT->matDescr,
                               matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                               matrixT->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
     #endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
   #else
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows = A->rmap->n;
      temp->num_cols = A->cmap->n;
      temp->num_entries = a->nz;
      temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle,
                              matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get());CHKERRCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      tempT->num_rows = A->rmap->n;
      tempT->num_cols = A->cmap->n;
      tempT->num_entries = a->nz;
      tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                              temp->num_cols, temp->num_entries,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get(),
                              tempT->values->data().get(),
                              tempT->column_indices->data().get(),
                              tempT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

      /* Last, convert CSC to HYB */
      cusparseHybMat_t hybMat;
      stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
      cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
        CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                              matstructT->descr, tempT->values->data().get(),
                              tempT->row_offsets->data().get(),
                              tempT->column_indices->data().get(),
                              hybMat, 0, partition);CHKERRCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      cusparsestruct->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY*) tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
        delete (CsrMatrix*) tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY*) temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
        delete (CsrMatrix*) temp;
      }
     #endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
    if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix");
    if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix rows");
    if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix cols");
    if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix values");
    if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT");
    if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT rows");
    if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT cols");
    if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
      ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    if (!cusparsestruct->csr2csc_i) {
      /* first transpose update: discover the CSR->CSC entry permutation by
         transposing the sequence 0,1,2,... in place of the actual values */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* CUDA 11 csr2csc needs an explicitly sized scratch buffer */
      void   *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                           A->cmap->n, matrix->num_entries,
                                           matrix->values->data().get(),
                                           cusparsestruct->rowoffsets_gpu->data().get(),
                                           matrix->column_indices->data().get(),
                                           matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                           CUSPARSE_ACTION_NUMERIC,indexBase,
                                           cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat);
      err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err);
     #endif

      stat = cusparse_csr2csc(cusparsestruct->handle,
                              A->rmap->n,A->cmap->n,matrix->num_entries,
                              csr2csc_a.data().get(),cusparsestruct->rowoffsets_gpu->data().get(),matrix->column_indices->data().get(),
                              matrixT->values->data().get(),
                             #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                              matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                              CUSPARSE_ACTION_NUMERIC,indexBase,
                              cusparsestruct->csr2cscAlg, csr2cscBuffer
                             #else
                              matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase
                             #endif
);CHKERRCUSPARSE(stat);
      /* matrixT->values now holds the (real-valued) source index of each transposed entry */
      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      err = cudaFree(csr2cscBuffer);CHKERRCUDA(err);
     #endif
    }
    /* gather A's values through the cached permutation to fill A^T's values */
    PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
                                                     thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
                                                     matrixT->values->begin()));
  }
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  cusparsestruct->transupdated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
1401 
1402 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
  Triangular solve with the transposed factors for a permuted (non-natural
  ordering) factorization: b is first reordered through rpermIndices, the
  upper-transpose solve runs, then the lower-transpose solve, and the result
  is finally reordered through cpermIndices.  The transpose analysis structs
  are built lazily on first use.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: x <- b[rperm] */
  /* NOTE(review): the end iterator uses bGPU+n as the element base while the
     begin uses bGPU; presumably only the index iterators drive the iteration
     (cf. MatSolve_SeqAIJCUSPARSE which uses bGPU for both) -- confirm */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U: input xarray, output in the work vector */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Then, solve L: input work vector, output xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1488 
/*
  Triangular solve with the transposed factors in natural ordering: no
  row/column permutations are applied; the upper-transpose solve maps b into
  the work vector and the lower-transpose solve maps the work vector into x.
  The transpose analysis structs are built lazily on first use.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U: input barray, output in the work vector */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Then, solve L: input work vector, output xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1556 
/*
  Forward/backward triangular solve for a permuted (non-natural ordering)
  factorization: b is reordered through rpermIndices into the work vector,
  L is solved (work -> x), U is solved (x -> work), and the work vector is
  reordered through cpermIndices into the final x.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: work <- b[rperm] */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L: input work vector, output xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Then, solve U: input xarray, output back in the work vector */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Last, reorder with the column permutation: x <- work[cperm] */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1631 
/*
  Forward/backward triangular solve in natural ordering: no permutations;
  L is solved from b into the work vector, then U from the work vector into x.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L: input barray, output in the work vector */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Next, solve U: input work vector, output xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1691 
/* Mirror the device matrix values back into the host CSR array when the GPU
   copy is ahead of the host one; on success both copies are marked in sync
   (PETSC_OFFLOAD_BOTH).  Only the values are copied -- the sparsity pattern
   on the host is assumed unchanged. */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *aij  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;
  cudaError_t        cerr;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    CsrMatrix *csr = (CsrMatrix*)cusp->mat->mat;

    ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    cerr = cudaMemcpy(aij->a, csr->values->data().get(), aij->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuToCpu(aij->nz*sizeof(PetscScalar));CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
1712 
/* MatSeqAIJGetArray() implementation for CUSPARSE matrices: bring the host
   values up to date, hand out the host array, and mark the host copy as the
   authoritative one (PETSC_OFFLOAD_CPU) since the caller may modify it. */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  *array = aij->a;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
1724 
/* Upload the matrix data of A to the GPU.
   Two paths:
   - fast path: if the nonzero pattern is unchanged (same nonzerostate) and the format is CSR,
     only the numerical values are re-uploaded into the existing cusparse structures;
   - slow path: otherwise all cusparse structures are destroyed and rebuilt from the host CSR
     (optionally using the compressed-row representation that skips empty rows).
   On success the offload mask becomes PETSC_OFFLOAD_BOTH, unless the host has no values array
   yet (pattern-only upload), in which case the mask is left unchanged. */
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE; /* set to FALSE when only the pattern (no values) is uploaded */
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR values");
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      matrix->values->assign(a->a, a->a+a->nz);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* values changed, so any cached transpose values are stale (structure kept) */
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
    } else {
      /* pattern changed (or non-CSR format): tear down and rebuild all device structures */
      PetscInt nnz;
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        if (a->compressedrow.use) {
          /* compressed-row storage: only nonempty rows are kept; ridx maps them back to full row numbers */
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR row data");
        if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR column data");
        /* no host values yet: take nnz from the row offsets and do not mark the host copy as current */
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
        else nnz = a->nz;

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* scalar constants (1, 0) live in device memory because the cusparse
           pointer mode is set to CUSPARSE_POINTER_MODE_DEVICE below */
        err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                    mat->num_rows, mat->num_cols, mat->num_entries,
                                    mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                    mat->values->data().get(),
                                    CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                    CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
          }
         #endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
         #else
          /* legacy pre-CUDA-11 path: build a temporary CSR and convert it to HYB/ELL */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
              matstruct->descr, mat->values->data().get(),
              mat->row_offsets->data().get(),
              mat->column_indices->data().get(),
              hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          /* the temporary CSR is no longer needed once the HYB matrix is built */
          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
         #endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          /* scratch vector sized to the number of nonempty rows, used by the compressed-row multiply path */
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
1881 
/* Thrust functor for zip iterators: accumulates the first tuple entry into the
   second, i.e. elementwise y += x. */
struct VecCUDAPlusEquals
{
  template <typename T>
  __host__ __device__
  void operator()(T t)
  {
    thrust::get<1>(t) = thrust::get<0>(t) + thrust::get<1>(t);
  }
};
1891 
/* Thrust functor for zip iterators: copies the first tuple entry into the
   second, i.e. elementwise y = x. */
struct VecCUDAEquals
{
  template <typename T>
  __host__ __device__
  void operator()(T t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
1901 
/* Thrust functor for zip iterators: copies the second tuple entry into the
   first, i.e. elementwise x = y (the reverse direction of VecCUDAEquals). */
struct VecCUDAEqualsReverse
{
  template <typename T>
  __host__ __device__
  void operator()(T t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};
1911 
/* Scratch data attached to a Mat_Product (C->product->data) by the sparse*dense and
   sparse*sparse symbolic phases, consumed and freed (MatDestroy_MatMatCusparse) later */
struct MatMatCusparse {
  PetscBool             cisdense; /* C was MATSEQDENSE on entry to the symbolic phase: convert the result back to host dense after the numeric phase */
  PetscScalar           *Bt;      /* device buffer for B^T, allocated pre CUDA-11 only (csrmm cannot transpose B there) */
  Mat                   X;        /* dense intermediate for RARt/PtAP products */
  PetscBool             reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops;    /* flop count logged during the numeric phase */
  CsrMatrix             *Bcsr;    /* alternate CSR for B when B is kept in compressed row storage — see the numeric phase */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr; /* sparse descriptor paired with Bcsr */
  PetscBool             initialized;   /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr;  /* dense descriptor for B */
  cusparseDnMatDescr_t  matCDescr;  /* dense descriptor for C (or X) */
  PetscInt              Blda,Clda; /* Record leading dimensions of B and C here to detect changes */
  size_t                mmBufferSize; /* size of mmBuffer */
  void                  *mmBuffer;    /* SpMM / SpGEMM compute workspace */
  void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc; /* descriptor for sparse*sparse (SpGEMM) products */
#endif
};
1931 
/* Destroy callback for a MatMatCusparse product-data bundle: releases every
   device buffer, cusparse descriptor, and the intermediate matrix X, then
   frees the struct itself. */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse   *mm = (MatMatCusparse*)data;
  PetscErrorCode   ierr;
  cudaError_t      ccer;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t cstat;
 #endif

  PetscFunctionBegin;
  /* cudaFree() and delete are both no-ops on NULL, so Bt/Bcsr need no guards */
  ccer = cudaFree(mm->Bt);CHKERRCUDA(ccer);
  delete mm->Bcsr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (mm->matSpBDescr) { cstat = cusparseDestroySpMat(mm->matSpBDescr);CHKERRCUSPARSE(cstat); }
  if (mm->mmBuffer)    { ccer  = cudaFree(mm->mmBuffer);CHKERRCUDA(ccer); }
  if (mm->mmBuffer2)   { ccer  = cudaFree(mm->mmBuffer2);CHKERRCUDA(ccer); }
  if (mm->matBDescr)   { cstat = cusparseDestroyDnMat(mm->matBDescr);CHKERRCUSPARSE(cstat); }
  if (mm->matCDescr)   { cstat = cusparseDestroyDnMat(mm->matCDescr);CHKERRCUSPARSE(cstat); }
  if (mm->spgemmDesc)  { cstat = cusparseSpGEMM_destroyDescr(mm->spgemmDesc);CHKERRCUSPARSE(cstat); }
 #endif
  ierr = MatDestroy(&mm->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1956 
1957 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1958 
/* Numeric phase of C = op(A) op(B) with A sparse (SEQAIJCUSPARSE) and B dense.
   AB/AtB/ABt write directly into C; RARt and PtAP first compute the dense
   intermediate mmdata->X = A*op(B) here and then finish with a dense-dense
   product (B*X or B^T*X) on the GPU.  On CUDA >= 11 the cusparseSpMM generic
   API is used (descriptors and workspace cached in mmdata); earlier versions
   fall back to csrmm, with an explicit B^T formed via cublasXgeam when op(B)
   is a transpose. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* select op(A), the mult struct to use, and the result dimensions m x n of this SpMM call */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP: /* PtAP computes X = A*P here; P^T*X is done after the SpMM */
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    /* either let cusparse transpose A, or use the explicitly stored transpose */
    if (!cusp->transgen) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt: /* RARt computes X = A*R^T here; R*X is done after the SpMM */
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);

  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* RARt/PtAP write into the intermediate X; the other products write into C directly */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    /* a changed leading dimension invalidates the cached dense descriptor */
    if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Clda = clda;
    }

    /* lazily create the sparse descriptor for A if no SpMV/SpMM has created it yet */
    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
    /* grow (never shrink) the SpMM workspace */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
      cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
 #else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr; /* NOTE: shadows the outer cudaError_t cerr inside this scope */

    /* form B^T explicitly in mmdata->Bt (allocated in the symbolic phase) via geam */
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k    = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
 #endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  /* finish RARt/PtAP with a dense-dense product (transA selects B*X vs B^T*X) */
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  /* restore the types the caller handed us */
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2132 
/* Symbolic phase for products C = op(A) op(B) with A SEQAIJCUSPARSE and B dense:
   sets the sizes and (CUDA dense) type of C and allocates the MatMatCusparse
   scratch consumed by MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(). */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          isdense,match;
  PetscErrorCode     ierr;
  MatMatCusparse     *mm;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&match);CHKERRQ(ierr);
  if (!match) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  /* result dimensions for each supported product type */
  if      (product->type == MATPRODUCT_AB)   { m = A->rmap->n; n = B->cmap->n; }
  else if (product->type == MATPRODUCT_AtB)  { m = A->cmap->n; n = B->cmap->n; }
  else if (product->type == MATPRODUCT_ABt)  { m = A->rmap->n; n = B->rmap->n; }
  else if (product->type == MATPRODUCT_PtAP) { m = B->cmap->n; n = B->cmap->n; }
  else if (product->type == MATPRODUCT_RARt) { m = B->rmap->n; n = B->rmap->n; }
  else SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&isdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mm);CHKERRQ(ierr);
  mm->cisdense = isdense;
 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mm->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
 #endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mm->X);CHKERRQ(ierr);
    ierr = MatSetType(mm->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
    if (product->type == MATPRODUCT_RARt) {
      ierr = MatSetSizes(mm->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mm->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mm;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2206 
/* Numeric phase of the sparse*sparse product C = op(A) op(B), both SEQAIJCUSPARSE.
   The structure of C was fixed during the symbolic phase; here only the values are
   (re)computed on the GPU: cusparseSpGEMM_compute/copy on CUDA >= 11, legacy
   csrgemm otherwise.  When mmdata->reusesym is set, the values were already
   computed inside the symbolic phase and we only run the assembly bookkeeping. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t         BmatSpDescr;
#endif

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;
  A = product->A;
  B = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix*)Cmat->mat;
    if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
    goto finalize;
  }
  if (!c->nz) goto finalize; /* empty product: nothing to compute, just finish assembly */
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);

  /* for symmetric matrices, op(M)^T == op(M), so the transposed products reduce to plain AB */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
  if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix*)Cmat->mat;
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
  if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  /* recompute values with the spgemm descriptor/workspace prepared in the symbolic phase, then copy into C */
  stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
  ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
  c->reallocs         = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}
2321 
/*
  Symbolic phase of C = A*B, A^t*B or A*B^t for two MATSEQAIJCUSPARSE matrices.

  Computes the sparsity pattern of C on the GPU (cusparseSpGEMM on CUDA >= 11.0,
  cusparseXcsrgemm below that), allocates C's device CSR storage, and copies the
  row offsets / column indices back to the host so that C looks like a fully
  preallocated SeqAIJ matrix. Numeric values are filled later through
  C->ops->productnumeric (set at the bottom of this routine).
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *a,*b,*c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     i,j,m,n,k;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble               flops;
  PetscBool                    biscompressed,ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
  size_t                       bufSize2;
  cusparseSpMatDescr_t         BmatSpDescr;
#else
  int                          cnz;
#endif

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A    = product->A;
  B    = product->B;
  /* both operands must be MATSEQAIJCUSPARSE in CSR storage */
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ*)A->data;
  b = (Mat_SeqAIJ*)B->data;
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
  ptype = product->type;
  /* for a symmetric operand the transposed product equals the plain one, so use the cheaper AB path */
  if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
  if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  /* pick result sizes and operand mult structs (the stored transpose is used for A^t/B^t) */
  switch (ptype) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    k = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(B);CHKERRQ(ierr);
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  ierr  = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  ierr  = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  c     = (Mat_SeqAIJ*)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
  Ccsr  = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, then c will be in compressed row format */
    c->compressedrow.nrows = a->compressedrow.nrows;
    ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
    ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
  Ccusp->mat      = Cmat;
  Ccusp->mat->mat = Ccsr;
  Ccsr->num_rows    = Ccusp->nrows;
  Ccsr->num_cols    = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
  stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
  /* device-resident scalar constants used as alpha/beta in cusparse calls (device pointer mode) */
  cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
    thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
    c->nz = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix*)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix */
    CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
    /* shallow wrapper around B's column indices/values, but with the uncompressed row offsets */
    Bcsr = new CsrMatrix;
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices;
    Bcsr->values         = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
      ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr = Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
                               Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                               Bcsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
  /* precompute flops count */
  if (ptype == MATPRODUCT_AB) {
    /* each nonzero a(i,brow) multiplies the whole row brow of B: 2*nnz(B(brow,:)) flops */
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i+1];
      for (j=st; j<en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2.*(b->i[brow+1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt anzi = a->i[i+1] - a->i[i];
      const PetscInt bnzi = b->i[i+1] - b->i[i];
      flops += (2.*anzi)*bnzi;
    }
  } else { /* TODO */
    flops = 0.;
  }

  mmdata->flops = flops;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* CUDA >= 11: generic SpGEMM API (workEstimation -> compute -> copy), buffers kept in mmdata for reuse */
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
                           NULL, NULL, NULL,
                           CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                           CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
  /* ask bufferSize bytes for external memory */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
  /* ask bufferSize again bytes for external memory */
  stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
  /* The CUSPARSE documentation is not clear, nor the API
     We need both buffers to perform the operations properly!
     mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
     it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
     is stored in the descriptor! What a messy API... */
  cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
  /* compute the intermediate product of A * B */
  stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  /* get matrix C non-zero entries C_nnz1 */
  stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
  c->nz = (PetscInt) C_nnz1;
  ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr);
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#else
  /* CUDA < 11: legacy csrgemm, nnz count first (host pointer mode) then full numeric gemm */
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
  stat = cusparseXcsrgemmNnz(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
  c->nz = cnz;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
     I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
     D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
  stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalizesym:
  /* build the host-side SeqAIJ pattern (i, j, ilen, imax) by copying the device CSR back */
  c->singlemalloc = PETSC_FALSE;
  c->free_a       = PETSC_TRUE;
  c->free_ij      = PETSC_TRUE;
  ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
  if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
    PetscInt *d_i = c->i;
    THRUSTINTARRAY ii(Ccsr->row_offsets->size());
    THRUSTINTARRAY jj(Ccsr->column_indices->size());
    ii   = *Ccsr->row_offsets;
    jj   = *Ccsr->column_indices;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  } else {
    PetscInt *d_i = c->i;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  if (ciscompressed) { /* need to expand host row offsets */
    PetscInt r = 0;
    c->i[0] = 0;
    for (k = 0; k < c->compressedrow.nrows; k++) {
      const PetscInt next = c->compressedrow.rindex[k];
      const PetscInt old = c->compressedrow.i[k];
      for (; r < next; r++) c->i[r+1] = old;
    }
    for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
  }
  ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
  c->maxnz = c->nz;
  c->nonzerorowcnt = 0;
  c->rmax = 0;
  /* per-row lengths and row maxima from the expanded row offsets */
  for (k = 0; k < m; k++) {
    const PetscInt nn = c->i[k+1] - c->i[k];
    c->ilen[k] = c->imax[k] = nn;
    c->nonzerorowcnt += (PetscInt)!!nn;
    c->rmax = PetscMax(c->rmax,nn);
  }
  ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
  Ccsr->num_entries = c->nz;

  C->nonzerostate++;
  ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
  Ccusp->nonzerostate = C->nonzerostate;
  C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
  C->preallocated  = PETSC_TRUE;
  C->assembled     = PETSC_FALSE;
  C->was_assembled = PETSC_FALSE;
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
2646 
2647 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2648 
/* handles sparse or dense B: picks the product implementation for a SeqAIJCUSPARSE A */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
  PetscErrorCode ierr;
  PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  /* classify the operands: dense B routes to the AIJ*dense kernels, cusparse B (and C for ABC) to the sparse ones */
  ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr);
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr);
  }
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr);
    }
  }
  if (isdense) {
    const MatProductType pt = product->type;
    if (pt == MATPRODUCT_AB || pt == MATPRODUCT_AtB || pt == MATPRODUCT_ABt || pt == MATPRODUCT_PtAP || pt == MATPRODUCT_RARt) {
      if (product->A->boundtocpu) {
        /* A lives on the CPU: use the host AIJ*dense path */
        ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
    } else if (pt == MATPRODUCT_ABC) {
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
    }
    /* any other product type: leave productsymbolic unset */
  } else if (Biscusp && Ciscusp) {
    const MatProductType pt = product->type;
    if (pt == MATPRODUCT_AB || pt == MATPRODUCT_AtB || pt == MATPRODUCT_ABt) {
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
    } else if (pt == MATPRODUCT_PtAP || pt == MATPRODUCT_RARt || pt == MATPRODUCT_ABC) {
      /* triple products fall back to chained pairwise products */
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
    }
  } else { /* fallback for AIJ */
    ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2707 
/* yy = A*xx: delegate to the shared SpMV kernel with no add vector, no transpose, no Hermitian */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2716 
/* zz = A*xx + yy: delegate to the shared SpMV kernel (no transpose, no Hermitian) */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2725 
/* yy = A^H*xx: delegate to the shared SpMV kernel with trans=TRUE, herm=TRUE */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2734 
/* zz = A^H*xx + yy: delegate to the shared SpMV kernel with trans=TRUE, herm=TRUE */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2743 
/* yy = A^T*xx: delegate to the shared SpMV kernel with trans=TRUE, herm=FALSE */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2752 
/* ScatterAdd: y[idx[i]] += x[i] for 0 <= i < n.
   Used to accumulate the compressed work vector back into the full-length result after an
   SpMV on a compressed-row matrix. Launched 1-D with enough threads to cover n; the bounds
   guard handles the ragged tail block.
   NOTE(review): the += is not atomic, so the entries of idx are assumed distinct (they are
   compressed-row indices at the call site) -- duplicate indices would race. */
__global__ static void ScatterAdd(PetscInt n, const PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  /* compute the flat index in PetscInt: the 32-bit product blockIdx.x*blockDim.x can overflow
     when n (a PetscInt) exceeds 2^31 with 64-bit indices enabled */
  PetscInt i = (PetscInt)blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) y[idx[i]] += x[i];
}
2758 
2759 /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
2760 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
2761 {
2762   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
2763   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
2764   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
2765   PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
2766   PetscErrorCode               ierr;
2767   cudaError_t                  cerr;
2768   cusparseStatus_t             stat;
2769   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2770   PetscBool                    compressed;
2771 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2772   PetscInt                     nx,ny;
2773 #endif
2774 
2775   PetscFunctionBegin;
2776   if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
2777   if (!a->nonzerorowcnt) {
2778     if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
2779     else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
2780     PetscFunctionReturn(0);
2781   }
2782   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
2783   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2784   if (!trans) {
2785     matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2786     if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
2787   } else {
2788     if (herm || !cusparsestruct->transgen) {
2789       opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
2790       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2791     } else {
2792       if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);}
2793       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
2794     }
2795   }
2796   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
2797   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
2798 
2799   try {
2800     ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
2801     if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
2802     else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */
2803 
2804     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2805     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2806       /* z = A x + beta y.
2807          If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
2808          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
2809       */
2810       xptr = xarray;
2811       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
2812       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
2813      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2814       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
2815           allocated to accommodate different uses. So we get the length info directly from mat.
2816        */
2817       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2818         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2819         nx = mat->num_cols;
2820         ny = mat->num_rows;
2821       }
2822      #endif
2823     } else {
2824       /* z = A^T x + beta y
2825          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
2826          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
2827        */
2828       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
2829       dptr = zarray;
2830       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
2831       if (compressed) { /* Scatter x to work vector */
2832         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
2833         thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
2834                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2835                          VecCUDAEqualsReverse());
2836       }
2837      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2838       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2839         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2840         nx = mat->num_rows;
2841         ny = mat->num_cols;
2842       }
2843      #endif
2844     }
2845 
2846     /* csr_spmv does y = alpha op(A) x + beta y */
2847     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2848      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2849       if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
2850       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
2851         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2852         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2853         stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
2854                                 matstruct->matDescr,
2855                                 matstruct->cuSpMV[opA].vecXDescr, beta,
2856                                 matstruct->cuSpMV[opA].vecYDescr,
2857                                 cusparse_scalartype,
2858                                 cusparsestruct->spmvAlg,
2859                                 &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
2860         cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);
2861 
2862         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
2863       } else {
2864         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
2865         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
2866         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
2867       }
2868 
2869       stat = cusparseSpMV(cusparsestruct->handle, opA,
2870                                matstruct->alpha_one,
2871                                matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */
2872                                matstruct->cuSpMV[opA].vecXDescr,
2873                                beta,
2874                                matstruct->cuSpMV[opA].vecYDescr,
2875                                cusparse_scalartype,
2876                                cusparsestruct->spmvAlg,
2877                                matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
2878      #else
2879       CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2880       stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
2881                                mat->num_rows, mat->num_cols,
2882                                mat->num_entries, matstruct->alpha_one, matstruct->descr,
2883                                mat->values->data().get(), mat->row_offsets->data().get(),
2884                                mat->column_indices->data().get(), xptr, beta,
2885                                dptr);CHKERRCUSPARSE(stat);
2886      #endif
2887     } else {
2888       if (cusparsestruct->nrows) {
2889        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2890         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2891        #else
2892         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
2893         stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
2894                                  matstruct->alpha_one, matstruct->descr, hybMat,
2895                                  xptr, beta,
2896                                  dptr);CHKERRCUSPARSE(stat);
2897        #endif
2898       }
2899     }
2900     cerr = WaitForCUDA();CHKERRCUDA(cerr);
2901     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2902 
2903     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2904       if (yy) { /* MatMultAdd: zz = A*xx + yy */
2905         if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
2906           ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
2907         } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
2908           ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
2909         }
2910       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
2911         ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
2912       }
2913 
2914       /* ScatterAdd the result from work vector into the full vector when A is compressed */
2915       if (compressed) {
2916         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2917         /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred)
2918            and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
2919            prevent that. So I just add a ScatterAdd kernel.
2920          */
2921        #if 0
2922         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
2923         thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
2924                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
2925                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2926                          VecCUDAPlusEquals());
2927        #else
2928         PetscInt n = matstruct->cprowIndices->size();
2929         ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
2930        #endif
2931         cerr = WaitForCUDA();CHKERRCUDA(cerr);
2932         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2933       }
2934     } else {
2935       if (yy && yy != zz) {
2936         ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
2937       }
2938     }
2939     ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
2940     if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
2941     else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
2942   } catch(char *ex) {
2943     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
2944   }
2945   if (yy) {
2946     ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
2947   } else {
2948     ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
2949   }
2950   PetscFunctionReturn(0);
2951 }
2952 
/* zz = A^T*xx + yy, computed on the GPU.
   Thin wrapper forwarding to the shared mult-add kernel; the two trailing flags
   presumably select (transpose = PETSC_TRUE, hermitian = PETSC_FALSE) — TODO confirm
   against the MatMultAddKernel_SeqAIJCUSPARSE signature. */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2961 
/* Completes assembly of a SeqAIJCUSPARSE matrix.

   Runs the host-side SeqAIJ assembly (which does very little when the matrix was
   assembled on the GPU), then — on a final assembly of an unfactored, GPU-bound
   matrix that owns a device-side split-CSR structure — marks the GPU copy as the
   authoritative one. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode             ierr;
  PetscSplitCSRDataStructure *devMat = NULL;

  PetscFunctionBegin;
  /* factored matrices carry a TriFactors struct in spptr, not a Mat_SeqAIJCUSPARSE */
  if (A->factortype == MAT_FACTOR_NONE) devMat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); /* this does very little if assembled on GPU - call it? */
  /* equivalent to the original early-return on FLUSH assembly or CPU-bound matrices */
  if (mode != MAT_FLUSH_ASSEMBLY && !A->boundtocpu && devMat) A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
2978 
2979 /* --------------------------------------------------------------------------------*/
2980 /*@
2981    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
2983    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
2984    assembly performance the user should preallocate the matrix storage by setting
2985    the parameter nz (or the array nnz).  By setting these parameters accurately,
2986    performance during matrix assembly can be increased by more than a factor of 50.
2987 
2988    Collective
2989 
2990    Input Parameters:
2991 +  comm - MPI communicator, set to PETSC_COMM_SELF
2992 .  m - number of rows
2993 .  n - number of columns
2994 .  nz - number of nonzeros per row (same for all rows)
2995 -  nnz - array containing the number of nonzeros in the various rows
2996          (possibly different for each row) or NULL
2997 
2998    Output Parameter:
2999 .  A - the matrix
3000 
3001    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
3003    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
3004 
3005    Notes:
3006    If nnz is given then nz is ignored
3007 
3008    The AIJ format (also called the Yale sparse matrix format or
3009    compressed row storage), is fully compatible with standard Fortran 77
3010    storage.  That is, the stored row and column indices can begin at
3011    either one (as in Fortran) or zero.  See the users' manual for details.
3012 
3013    Specify the preallocated storage with either nz or nnz (not both).
3014    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
3015    allocation.  For large problems you MUST preallocate memory or you
3016    will get TERRIBLE performance, see the users' manual chapter on matrices.
3017 
3018    By default, this format uses inodes (identical nodes) when possible, to
3019    improve numerical efficiency of matrix-vector products and solves. We
3020    search for consecutive rows with the same nonzero structure, thereby
3021    reusing matrix information to achieve increased efficiency.
3022 
3023    Level: intermediate
3024 
3025 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
3026 @*/
/* Create a MATSEQAIJCUSPARSE matrix with the given sizes and (optional) preallocation;
   see the man page above for the full parameter description. */
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); /* sequential matrix: local sizes equal global sizes */
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* call the SeqAIJ implementation directly; the cast drops const to match its signature */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3038 
/* Destroys a SeqAIJCUSPARSE matrix: frees the GPU-side structures held in spptr
   (either the Mat_SeqAIJCUSPARSE struct or, for factored matrices, the TriFactors
   struct), releases the device-side split-CSR copy if one exists, removes the
   composed methods, and finally delegates to MatDestroy_SeqAIJ. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode              ierr;
  PetscSplitCSRDataStructure  *d_mat = NULL;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    /* stash the device matrix pointer before spptr is destroyed below */
    d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
    ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL;
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  if (d_mat) {
    Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;
    cudaError_t                err;
    PetscSplitCSRDataStructure h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* copy the struct back to the host so we can free the device arrays it points to */
    err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    if (a->compressedrow.use) {
      /* compressed-row case allocated its own row-index array on the device */
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  /* clear every method composed by MatConvert_SeqAIJ_SeqAIJCUSPARSE / MatBindToCPU */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3074 
3075 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
3076 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicates A into *B: duplicate on the host as SeqAIJ first, then convert the
   copy in place to AIJCUSPARSE so it gets the GPU structures and method table. */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3086 
/* Y = a*X + Y on the GPU.

   Strategy, in order of preference:
   - if X and Y do not share the CUSPARSE axpy implementation, fall back to the CPU path;
   - if the nonzero patterns can be shown identical (checked on the device when the nz
     counts match), use a single cublasXaxpy on the value arrays;
   - for SUBSET_NONZERO_PATTERN, use cusparse csrgeam (with the CUDA>=11 buffer protocol);
   - otherwise fall back to MatAXPY_SeqAIJ on the host. */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry,*csrx;
  cudaError_t        cerr;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    /* mixed implementations (e.g. one matrix bound to CPU): do the work on the host */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
  if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
  if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;
  /* see if we can turn this into a cublas axpy */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    /* equal nz counts: compare row offsets and column indices on the device */
    bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
    if (eq) {
      eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
    }
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    cusparseStatus_t stat;
    PetscScalar      b = 1.0; /* coefficient of Y in a*X + 1.0*Y */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t           bufferSize;
    void             *buffer;
#endif

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    /* the scalars &a and &b passed below are host pointers */
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* CUDA >= 11: csrgeam2 requires an explicit work buffer sized by a query call */
    stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
                                          &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                          &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                             cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    cerr = cudaFree(buffer);CHKERRCUDA(cerr);
#else
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
#endif
    /* restore the pointer mode expected elsewhere in this file */
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical patterns: the value arrays line up entry-for-entry, so a dense axpy works */
    cublasHandle_t cublasv2handle;
    cublasStatus_t berr;
    PetscBLASInt   one = 1, bnz = 1;

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else {
    /* DIFFERENT_NONZERO_PATTERN (or the one-column workaround): host fallback */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3187 
/* Y = a*Y: scales every stored value of Y in place on the GPU with a single
   cublasXscal over the CSR value array, then invalidates the cached diagonal. */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)Y->data;
  PetscScalar    *yvals;
  cudaError_t    cuerr;
  cublasHandle_t handle;
  cublasStatus_t cberr;
  PetscBLASInt   inc = 1, nz = 1;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEGetArray(Y,&yvals);CHKERRQ(ierr);
  ierr = PetscCUBLASGetHandle(&handle);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(aij->nz,&nz);CHKERRQ(ierr); /* nz may overflow PetscBLASInt */
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  cberr = cublasXscal(handle,nz,&a,yvals,inc);CHKERRCUBLAS(cberr);
  cuerr = WaitForCUDA();CHKERRCUDA(cuerr);
  ierr = PetscLogGpuFlops(nz);CHKERRQ(ierr); /* one multiply per stored entry */
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSERestoreArray(Y,&yvals);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3211 
/* Zeroes all stored entries of A, on both the device and host copies, and sets
   the offload mask accordingly (BOTH when the GPU values were zeroed too). */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode             ierr;
  PetscBool                  both = PETSC_FALSE; /* true if the GPU values were zeroed as well */
  Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
    if (spptr->mat) {
      CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
      if (matrix->values) {
        both = PETSC_TRUE;
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
    /* also zero the cached transpose, if present, so it stays consistent */
    if (spptr->matTranspose) {
      CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
      if (matrix->values) {
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
  }
  //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);
  /* zero the host values directly; a->i[nrows] is the total number of stored entries */
  ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;

  PetscFunctionReturn(0);
}
3243 
/* Switches the matrix operation table and composed methods between the CPU
   (SeqAIJ) and GPU (SeqAIJCUSPARSE) implementations. When binding to the CPU
   the current GPU values are copied back first. Factored matrices are left
   untouched. Also toggles inode use, which only the CPU kernels exploit. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  if (flg) {
    /* binding to CPU: make sure the host copy is current before the CPU kernels run */
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* drop the GPU-specific composed methods */
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  } else {
    /* binding to GPU: install the CUSPARSE kernels and composed methods */
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  a->inode.use = flg; /* inode optimization only helps the CPU kernels */
  PetscFunctionReturn(0);
}
3294 
/* Converts a SeqAIJ matrix to SeqAIJCUSPARSE. Handles all three MatReuse modes;
   for MAT_INPLACE_MATRIX the input matrix itself is retargeted. Allocates the
   spptr GPU-side structure (Mat_SeqAIJCUSPARSE, or the TriFactors struct for
   factored matrices) with its cusparse handle/stream, installs the CUSPARSE
   operation table via MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE), and changes
   the type name. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  B = *newmat;

  /* vectors created by this matrix (e.g. MatCreateVecs) should be CUDA vectors */
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      spptr->format = MAT_CUSPARSE_CSR;
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
      spptr->deviceMat = NULL;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
      spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
    } else {
      /* factored matrices carry triangular-factor data instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3349 
/* Type constructor for MATSEQAIJCUSPARSE: builds a SeqAIJ matrix, converts it in
   place to AIJCUSPARSE, then processes the type-specific options database keys. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  ierr = PetscObjectOptionsBegin((PetscObject)B);CHKERRQ(ierr);
  ierr = MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3362 
3363 /*MC
3364    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3365 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
3367    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
3368    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3369 
3370    Options Database Keys:
3371 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
3372 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3373 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3374 
3375   Level: beginner
3376 
3377 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3378 M*/
3379 
3380 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
3381 
3382 
/* Registers MATSOLVERCUSPARSE as a solver for MATSEQAIJCUSPARSE: the same factory
   routine serves all four supported factorization types (LU, Cholesky, ILU, ICC). */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode      ierr;
  const MatFactorType factortypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  int                 i;

  PetscFunctionBegin;
  for (i=0; i<(int)(sizeof(factortypes)/sizeof(factortypes[0])); i++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,factortypes[i],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3394 
/* Frees all GPU-side data owned by a Mat_SeqAIJCUSPARSE struct: the mult structs
   for A and A^T, the thrust work/index vectors, the cusparse handle, and finally
   the struct itself. Safe to call with *cusparsestruct == NULL. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    /* thrust vectors; delete on NULL is a no-op */
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    delete (*cusparsestruct)->csr2csc_i;
    if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3414 
/* Frees a CsrMatrix (its three thrust device vectors plus the struct itself)
   and NULLs the caller's pointer. Safe to call with *mat == NULL. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    CsrMatrix *m = *mat;

    *mat = NULL; /* clear the caller's pointer before freeing */
    delete m->values;
    delete m->column_indices;
    delete m->row_offsets;
    delete m;
  }
  PetscFunctionReturn(0);
}
3427 
/* Frees one triangular-factor struct: its cusparse matrix descriptor and analysis
   info, the CSR storage, and the device/host work buffers. Safe on NULL input. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    if ((*trifactor)->solveBuffer)   {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
    if ((*trifactor)->AA_h)   {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* the csr2csc conversion buffer only exists with the CUDA-11 API */
    if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
   #endif
    ierr = PetscFree(*trifactor);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3447 
/* Frees a Mat_SeqAIJCUSPARSEMultStruct: the stored matrix (CSR or, pre-CUDA-11,
   HYB), the matrix descriptor, the compressed-row index vector, the device-side
   scalar constants, and (CUDA >= 11) the SpMV descriptors/buffers. Safe on NULL.

   Fix: the CsrMatrix_Destroy() call previously discarded its returned
   PetscErrorCode; it is now checked with CHKERRQ like every other call site. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
       #endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr); /* error code was previously ignored */
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    /* alpha_one/beta_zero/beta_one are scalars kept in device memory */
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    /* one cached SpMV setup per operation variant (see the cuSpMV array) */
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
3491 
/* Releases the contents of a TriFactors struct — the four triangular factor
   structs, the permutation index vectors, and the work vector — without freeing
   the struct or its cusparse handle, so it can be refilled by a refactorization. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose);CHKERRQ(ierr);
    /* thrust vectors: delete (no-op on NULL) and clear so a reset struct is reusable */
    delete fs->rpermIndices;  fs->rpermIndices = NULL;
    delete fs->cpermIndices;  fs->cpermIndices = NULL;
    delete fs->workVector;    fs->workVector   = NULL;
  }
  PetscFunctionReturn(0);
}
3511 
/* Destroys a TriFactors struct completely: resets its contents, destroys the
   cusparse handle, and frees the struct itself.

   Fix: the handle test used an assignment inside the if-condition
   (`if (handle = ...)`) — legal but a classic -Wparentheses hazard; split into
   an explicit assignment followed by the test. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    handle = (*trifactors)->handle;
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3528 
/* Strict weak ordering on (i,j) index pairs: lexicographic, first component
   (row) major. Used to sort COO entries by row, then by column within a row. */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    const PetscInt r1 = t1.get<0>(), r2 = t2.get<0>();
    if (r1 != r2) return r1 < r2;
    return t1.get<1>() < t2.get<1>();
  }
};
3539 
/* Equality predicate for (row,column) index pairs: true iff both components match. */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>();
  }
};
3549 
/* Binary op returning 1 when its arguments differ, 0 when equal; used with
   adjacent_difference to flag positions where a sorted index sequence changes. */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != t2) ? 1 : 0;
  }
};
3558 
/* Logical-or of two integers as a 0/1 PetscInt; combines the row-change and
   column-change flags produced by IJDiff. */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != 0 || t2 != 0) ? 1 : 0;
  }
};
3567 
3568 #include <thrust/iterator/discard_iterator.h>
/*
   Sets (or adds) matrix values directly on the GPU from a COO-ordered value
   array v, using the permutation cusp->cooPerm and the duplicate-compression
   key array cusp->cooPerm_a built by MatSetPreallocationCOO_SeqAIJCUSPARSE().

   Input:
     A     - the matrix (must have been preallocated with the COO API)
     v     - values in the original COO order; may live in host or device
             memory (detected via isCudaMem); NULL zeroes values for INSERT_VALUES
     imode - ADD_VALUES accumulates into current values, INSERT_VALUES overwrites

   On exit the matrix is marked assembled with its up-to-date copy on the GPU.
*/
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY                           *cooPerm_v = NULL; /* device staging buffer when v is host memory */
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                             *matrix;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;
  PetscInt                              n;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  /* no COO permutation: the matrix was not preallocated via the COO API, fall back to a plain assembly */
  if (!cusp->cooPerm) {
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  /* NULL v: for INSERT_VALUES this zeroes the stored values; for ADD_VALUES it is a no-op */
  if (!v) {
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    /* v already on the device: wrap it directly */
    d_v = thrust::device_pointer_cast(v);
  } else {
    /* v on the host: stage a device copy and log the transfer */
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v,v+n);
    d_v = cooPerm_v->data();
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
  }
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) {
      /* duplicates present: first reduce the permuted values by key into a
         scratch array, then add the reduced values onto the CSR values */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* unique entries: add the permuted values directly in place */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals());
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else {
      /* unique entries: write the permuted values straight into the CSR values */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    }
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalize:
  delete cooPerm_v;
  /* the GPU now holds the authoritative values */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr);
  ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr);
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
3647 
/* Marks the cached transpose of A as out of date; when destroy is PETSC_TRUE
   the transpose structure and the csr2csc permutation index array are freed
   as well. A no-op when the matrix has no CUSPARSE struct yet. */
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp) {
    if (destroy) {
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
      delete cusp->csr2csc_i;
      cusp->csr2csc_i = NULL;
    }
    cusp->transupdated = PETSC_FALSE; /* force a rebuild on next transpose use */
  }
  PetscFunctionReturn(0);
}
3664 
3665 #include <thrust/binary_search.h>
/*
   Preallocates the matrix from n COO (i,j) index pairs, computing on the GPU:
     - cusp->cooPerm:   permutation mapping the caller's COO order to the
                        sorted (row-major) CSR order, used later by
                        MatSetValuesCOO_SeqAIJCUSPARSE()
     - cusp->cooPerm_a: for inputs with repeated (i,j) entries, the compressed
                        destination index of each COO entry (NULL if all
                        entries are unique)
   and then builds the host-side AIJ structure (a->i, a->j, ilen/imax, etc.)
   from the sorted/uniqued indices. Values are zeroed; the struct for matvec
   is allocated immediately via MatSeqAIJCUSPARSECopyToGPU().
*/
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0; /* nzr counts rows with at least one nonzero */
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  /* discard previously computed permutations if the number of entries changed */
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);           /* device copies of the COO row/col indices */
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);   /* per-row offsets computed on the device */

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);
    /* sort (i,j) pairs lexicographically, carrying along the original positions as cooPerm */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare());
    /* keep copies of the sorted indices to detect duplicates below */
    *cusp->cooPerm_a = d_i;
    THRUSTINTARRAY w = d_j;

    /* drop duplicate (i,j) pairs from the sorted sequence */
    auto nekey = thrust::unique(fkey, ekey, IJEqual());
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* I couldn't come up with a more elegant algorithm */
      /* flag positions where row or column changes, then prefix-sum the flags:
         cooPerm_a[k] becomes the compressed (uniqued) slot each sorted COO
         entry contributes to */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff());
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());
      (*cusp->cooPerm_a)[0] = 0;
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum());
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>());
    }
    /* count entries per row via binary search of the row indices */
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(),
                        search_begin, search_begin + A->rmap->n,
                        ii.begin());
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* rebuild the host AIJ structure from the device results */
    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0;
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    /* per-row lengths, maximum row length, and nonzero-row count */
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
  } else {
    /* empty matrix: ordinary empty preallocation */
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);

  A->assembled = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3765 
/* Returns a read-only device pointer to the CSR values of A, first ensuring
   the device copy is up to date. Only the CSR storage format is supported.
   Pair with MatSeqAIJCUSPARSERestoreArrayRead(). */
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csrmat;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* make sure the device holds current values before exposing them */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusparsestruct->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)cusparsestruct->mat->mat;
  if (!csrmat->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csrmat->values->data().get();
  PetscFunctionReturn(0);
}
3784 
/* Releases a device pointer obtained with MatSeqAIJCUSPARSEGetArrayRead().
   Read access cannot have changed the matrix, so no state bump is needed;
   the caller's pointer is simply invalidated. */
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  *a = NULL; /* prevent accidental reuse of the borrowed pointer */
  PetscFunctionReturn(0);
}
3794 
/* Returns a writable device pointer to the CSR values of A, first ensuring
   the device copy is current. Marks the GPU as the authoritative copy and
   invalidates the cached transpose (values may change). Only the CSR format
   is supported. Pair with MatSeqAIJCUSPARSERestoreArray(). */
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csrmat;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* bring current values to the device before handing out write access */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusparsestruct->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)cusparsestruct->mat->mat;
  if (!csrmat->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csrmat->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* caller may modify values on the device */
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3815 
/* Releases a device pointer obtained with MatSeqAIJCUSPARSEGetArray().
   Bumps the object state since the caller may have modified the values,
   then invalidates the caller's pointer. */
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); /* values may have changed */
  *a = NULL; /* prevent accidental reuse of the borrowed pointer */
  PetscFunctionReturn(0);
}
3828 
/* Returns a write-only device pointer to the CSR values of A. Unlike
   MatSeqAIJCUSPARSEGetArray(), no host-to-device copy is performed since the
   caller will overwrite the values. Marks the GPU as the authoritative copy
   and invalidates the cached transpose. Only the CSR format is supported.
   Pair with MatSeqAIJCUSPARSERestoreArrayWrite(). */
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csrmat;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (!cusparsestruct->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)cusparsestruct->mat->mat;
  if (!csrmat->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csrmat->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* device copy becomes authoritative once written */
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3848 
/* Releases a device pointer obtained with MatSeqAIJCUSPARSEGetArrayWrite().
   Bumps the object state (the values were presumably written) and
   invalidates the caller's pointer. */
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); /* values changed on the device */
  *a = NULL; /* prevent accidental reuse of the borrowed pointer */
  PetscFunctionReturn(0);
}
3861 
/* Lexicographic (row,column) ordering for 4-tuples (row, col, value, flag);
   the value and flag components do not participate in the comparison. */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    const int r1 = t1.get<0>(), r2 = t2.get<0>();
    if (r1 != r2) return r1 < r2;   /* rows differ: order by row */
    return t1.get<1>() < t2.get<1>(); /* same row: order by column */
  }
};
3872 
/* Unary functor adding a fixed offset to an int; used to shift column
   indices (and row offsets) when concatenating two CSR matrices. */
struct Shift
{
  int _shift; /* offset added to every input value */

  Shift(int shift) : _shift(shift) {}
  __host__ __device__
  inline int operator() (const int &c)
  {
    return _shift + c;
  }
};
3884 
/* merges two SeqAIJCUSPARSE matrices side by side, the [A,B] (i.e. [A';B']') operation in MATLAB notation */
3886 PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
3887 {
3888   PetscErrorCode               ierr;
3889   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
3890   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
3891   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
3892   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
3893   PetscInt                     Annz,Bnnz;
3894   cusparseStatus_t             stat;
3895   PetscInt                     i,m,n,zero = 0;
3896   cudaError_t                  cerr;
3897 
3898   PetscFunctionBegin;
3899   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
3900   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
3901   PetscValidPointer(C,4);
3902   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
3903   PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
3904   if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n);
3905   if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
3906   if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3907   if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3908   if (reuse == MAT_INITIAL_MATRIX) {
3909     m     = A->rmap->n;
3910     n     = A->cmap->n + B->cmap->n;
3911     ierr  = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
3912     ierr  = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
3913     ierr  = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
3914     c     = (Mat_SeqAIJ*)(*C)->data;
3915     Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
3916     Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
3917     Ccsr  = new CsrMatrix;
3918     Cmat->cprowIndices      = NULL;
3919     c->compressedrow.use    = PETSC_FALSE;
3920     c->compressedrow.nrows  = 0;
3921     c->compressedrow.i      = NULL;
3922     c->compressedrow.rindex = NULL;
3923     Ccusp->workVector       = NULL;
3924     Ccusp->nrows    = m;
3925     Ccusp->mat      = Cmat;
3926     Ccusp->mat->mat = Ccsr;
3927     Ccsr->num_rows  = m;
3928     Ccsr->num_cols  = n;
3929     stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
3930     stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
3931     stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
3932     cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
3933     cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
3934     cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
3935     cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
3936     cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
3937     cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
3938     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
3939     ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
3940     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
3941     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(B);CHKERRQ(ierr);
3942     if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3943     if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3944 
3945     Acsr = (CsrMatrix*)Acusp->mat->mat;
3946     Bcsr = (CsrMatrix*)Bcusp->mat->mat;
3947     Annz = (PetscInt)Acsr->column_indices->size();
3948     Bnnz = (PetscInt)Bcsr->column_indices->size();
3949     c->nz = Annz + Bnnz;
3950     Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
3951     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3952     Ccsr->values = new THRUSTARRAY(c->nz);
3953     Ccsr->num_entries = c->nz;
3954     Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
3955     if (c->nz) {
3956       auto Acoo = new THRUSTINTARRAY32(Annz);
3957       auto Bcoo = new THRUSTINTARRAY32(Bnnz);
3958       auto Ccoo = new THRUSTINTARRAY32(c->nz);
3959       THRUSTINTARRAY32 *Aroff,*Broff;
3960 
3961       if (a->compressedrow.use) { /* need full row offset */
3962         if (!Acusp->rowoffsets_gpu) {
3963           Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
3964           Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
3965           ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
3966         }
3967         Aroff = Acusp->rowoffsets_gpu;
3968       } else Aroff = Acsr->row_offsets;
3969       if (b->compressedrow.use) { /* need full row offset */
3970         if (!Bcusp->rowoffsets_gpu) {
3971           Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
3972           Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
3973           ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
3974         }
3975         Broff = Bcusp->rowoffsets_gpu;
3976       } else Broff = Bcsr->row_offsets;
3977       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3978       stat = cusparseXcsr2coo(Acusp->handle,
3979                               Aroff->data().get(),
3980                               Annz,
3981                               m,
3982                               Acoo->data().get(),
3983                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
3984       stat = cusparseXcsr2coo(Bcusp->handle,
3985                               Broff->data().get(),
3986                               Bnnz,
3987                               m,
3988                               Bcoo->data().get(),
3989                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
3990       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
3991       auto Aperm = thrust::make_constant_iterator(1);
3992       auto Bperm = thrust::make_constant_iterator(0);
3993 #if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
3994       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
3995       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
3996 #else
3997       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
3998       auto Bcib = Bcsr->column_indices->begin();
3999       auto Bcie = Bcsr->column_indices->end();
4000       thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
4001 #endif
4002       auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
4003       auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
4004       auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
4005       auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
4006       auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
4007       auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
4008       auto p1 = Ccusp->cooPerm->begin();
4009       auto p2 = Ccusp->cooPerm->begin();
4010       thrust::advance(p2,Annz);
4011       PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
4012 #if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
4013       thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
4014 #endif
4015       auto cci = thrust::make_counting_iterator(zero);
4016       auto cce = thrust::make_counting_iterator(c->nz);
4017 #if 0 //Errors on SUMMIT cuda 11.1.0
4018       PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
4019 #else
4020       auto pred = thrust::identity<int>();
4021       PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
4022       PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
4023 #endif
4024       stat = cusparseXcoo2csr(Ccusp->handle,
4025                               Ccoo->data().get(),
4026                               c->nz,
4027                               m,
4028                               Ccsr->row_offsets->data().get(),
4029                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4030       cerr = WaitForCUDA();CHKERRCUDA(cerr);
4031       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4032       delete wPerm;
4033       delete Acoo;
4034       delete Bcoo;
4035       delete Ccoo;
4036 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4037       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
4038                                Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
4039                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4040                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4041 #endif
4042       if (Acusp->transgen && Bcusp->transgen) { /* if A and B have the transpose, generate C transpose too */
4043         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4044         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4045         CsrMatrix *CcsrT = new CsrMatrix;
4046         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4047         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4048 
4049         Ccusp->transgen = PETSC_TRUE;
4050         Ccusp->transupdated = PETSC_TRUE;
4051         Ccusp->rowoffsets_gpu = NULL;
4052         CmatT->cprowIndices = NULL;
4053         CmatT->mat = CcsrT;
4054         CcsrT->num_rows = n;
4055         CcsrT->num_cols = m;
4056         CcsrT->num_entries = c->nz;
4057 
4058         CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
4059         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4060         CcsrT->values = new THRUSTARRAY(c->nz);
4061 
4062         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4063         auto rT = CcsrT->row_offsets->begin();
4064         if (AT) {
4065           rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
4066           thrust::advance(rT,-1);
4067         }
4068         if (BT) {
4069           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
4070           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
4071           thrust::copy(titb,tite,rT);
4072         }
4073         auto cT = CcsrT->column_indices->begin();
4074         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
4075         if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
4076         auto vT = CcsrT->values->begin();
4077         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4078         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4079         cerr = WaitForCUDA();CHKERRCUDA(cerr);
4080         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4081 
4082         stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
4083         stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4084         stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
4085         cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
4086         cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
4087         cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
4088         cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4089         cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4090         cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4091 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4092         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
4093                                  CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
4094                                  CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4095                                  CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4096 #endif
4097         Ccusp->matTranspose = CmatT;
4098       }
4099     }
4100 
4101     c->singlemalloc = PETSC_FALSE;
4102     c->free_a       = PETSC_TRUE;
4103     c->free_ij      = PETSC_TRUE;
4104     ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
4105     ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
4106     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
4107       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4108       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4109       ii   = *Ccsr->row_offsets;
4110       jj   = *Ccsr->column_indices;
4111       cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4112       cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4113     } else {
4114       cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4115       cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4116     }
4117     ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
4118     ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
4119     ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
4120     c->maxnz = c->nz;
4121     c->nonzerorowcnt = 0;
4122     c->rmax = 0;
4123     for (i = 0; i < m; i++) {
4124       const PetscInt nn = c->i[i+1] - c->i[i];
4125       c->ilen[i] = c->imax[i] = nn;
4126       c->nonzerorowcnt += (PetscInt)!!nn;
4127       c->rmax = PetscMax(c->rmax,nn);
4128     }
4129     ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
4130     ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
4131     (*C)->nonzerostate++;
4132     ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
4133     ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
4134     Ccusp->nonzerostate = (*C)->nonzerostate;
4135     (*C)->preallocated  = PETSC_TRUE;
4136   } else {
4137     if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",(*C)->rmap->n,B->rmap->n);
4138     c = (Mat_SeqAIJ*)(*C)->data;
4139     if (c->nz) {
4140       Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
4141       if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
4142       if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4143       if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
4144       ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
4145       ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
4146       if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4147       if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4148       Acsr = (CsrMatrix*)Acusp->mat->mat;
4149       Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4150       Ccsr = (CsrMatrix*)Ccusp->mat->mat;
4151       if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
4152       if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
4153       if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
4154       if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
4155       if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
4156       auto pmid = Ccusp->cooPerm->begin();
4157       thrust::advance(pmid,Acsr->num_entries);
4158       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4159       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
4160                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
4161       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
4162                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4163       thrust::for_each(zibait,zieait,VecCUDAEquals());
4164       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
4165                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4166       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
4167                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
4168       thrust::for_each(zibbit,ziebit,VecCUDAEquals());
4169       ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr);
4170       if (Acusp->transgen && Bcusp->transgen && Ccusp->transgen) {
4171         if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4172         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4173         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4174         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4175         CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
4176         auto vT = CcsrT->values->begin();
4177         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4178         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4179         Ccusp->transupdated = PETSC_TRUE;
4180       }
4181       cerr = WaitForCUDA();CHKERRCUDA(cerr);
4182       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4183     }
4184   }
4185   ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
4186   (*C)->assembled     = PETSC_TRUE;
4187   (*C)->was_assembled = PETSC_FALSE;
4188   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
4189   PetscFunctionReturn(0);
4190 }
4191 
/*
   Gathers selected entries of the matrix value array into v[].

   Input Parameters:
     A   - the SEQAIJCUSPARSE matrix
     n   - number of entries to copy
     idx - indices (into the aij value array) of the entries to gather;
           if NULL, the first n entries are copied contiguously

   Output Parameter:
     v   - array of n scalars; may point to host or device memory (detected
           with isCudaMem), results are delivered to wherever it points

   Note: the gather itself runs on the GPU via thrust; when v is host memory
   the result is staged in a temporary device workspace and copied down.
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  PetscErrorCode    ierr;
  bool              dmem;    /* true when v is device (CUDA) memory */
  const PetscScalar *av;
  cudaError_t       cerr;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
  if (n && idx) {
    /* upload the index set to the device */
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

    THRUSTARRAY *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      /* stage the gathered values in a device workspace, copied to host below */
      w = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* dv[i] = av[idx[i]] for i in [0,n), performed on the device */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (w) {
      cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    delete w;
  } else {
    /* no index set: contiguous copy of the first n entries */
    cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  /* when v is host memory the payload moved device->host, so log GpuToCpu
     (previously this logged CpuToGpu, i.e. the wrong transfer direction) */
  if (!dmem) { ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr); }
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4231