xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 71438e86cf8dea9f708c08733b37fef8eb68dc06)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_CXX_COMPLEX_FIX
7 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
8 
9 #include <petscconf.h>
10 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
11 #include <../src/mat/impls/sbaij/seq/sbaij.h>
12 #include <../src/vec/vec/impls/dvecimpl.h>
13 #include <petsc/private/vecimpl.h>
14 #undef VecType
15 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
16 #include <thrust/async/for_each.h>
17 
18 const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
19 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
20   /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
21     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
22 
23   typedef enum {
24       CUSPARSE_MV_ALG_DEFAULT = 0,
25       CUSPARSE_COOMV_ALG      = 1,
26       CUSPARSE_CSRMV_ALG1     = 2,
27       CUSPARSE_CSRMV_ALG2     = 3
28   } cusparseSpMVAlg_t;
29 
30   typedef enum {
31       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
32       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
33       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
34       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
35       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
36       CUSPARSE_SPMM_ALG_DEFAULT = 0,
37       CUSPARSE_SPMM_COO_ALG1    = 1,
38       CUSPARSE_SPMM_COO_ALG2    = 2,
39       CUSPARSE_SPMM_COO_ALG3    = 3,
40       CUSPARSE_SPMM_COO_ALG4    = 5,
41       CUSPARSE_SPMM_CSR_ALG1    = 4,
42       CUSPARSE_SPMM_CSR_ALG2    = 6,
43   } cusparseSpMMAlg_t;
44 
45   typedef enum {
46       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
47       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
48   } cusparseCsr2CscAlg_t;
49   */
50   const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
51   const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
52   const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
53 #endif
54 
55 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
56 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
57 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
58 
59 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
60 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
61 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
62 
63 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
64 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
65 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
66 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
67 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
68 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
69 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
70 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
71 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
72 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
73 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
74 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
75 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
76 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
77 
78 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
79 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
80 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
81 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
82 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
83 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
84 
85 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
86 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
87 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);
88 
89 PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
90 PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
91 
92 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
93 
/* MatCUSPARSESetStream - records the given CUDA stream in the matrix' GPU data
   structure and makes the matrix' cuSPARSE handle issue subsequent cuSPARSE
   calls on that stream. Errors with PETSC_ERR_COR if the GPU data (A->spptr)
   has not been allocated yet. */
94 PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
95 {
96   cusparseStatus_t   stat;
97   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
98 
99   PetscFunctionBegin;
100   if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
101   cusparsestruct->stream = stream;
  /* bind the stored handle to the new stream so later cuSPARSE work is enqueued there */
102   stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
103   PetscFunctionReturn(0);
104 }
105 
/* MatCUSPARSESetHandle - installs the given cuSPARSE handle on the matrix.
   If a different handle was already stored it is destroyed first (i.e. the
   matrix owns its previous handle). As a side effect the handle's pointer mode
   is set to CUSPARSE_POINTER_MODE_DEVICE, so scalar parameters of cuSPARSE
   calls are expected in device memory. Errors if A->spptr is missing. */
106 PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
107 {
108   cusparseStatus_t   stat;
109   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
110 
111   PetscFunctionBegin;
112   if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
113   if (cusparsestruct->handle != handle) {
114     if (cusparsestruct->handle) {
      /* release the handle we previously owned before adopting the new one */
115       stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
116     }
117     cusparsestruct->handle = handle;
118   }
  /* applied even when the handle is unchanged, so the pointer mode is always DEVICE */
119   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
120   PetscFunctionReturn(0);
121 }
122 
/* MatCUSPARSEClearHandle - drops the matrix' reference to its cuSPARSE handle
   without destroying it. Silently does nothing when A is not a MATSEQAIJCUSPARSE
   matrix or when its GPU data structure has not been allocated. */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  PetscErrorCode     ierr;
  PetscBool          iscusparse;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&iscusparse);CHKERRQ(ierr);
  if (iscusparse && cusp) {
    /* forget the handle; ownership/destruction is the caller's responsibility */
    if (cusp->handle) cusp->handle = 0;
  }
  PetscFunctionReturn(0);
}
135 
/* Query callback composed on factor matrices created by this package:
   reports MATSOLVERCUSPARSE as the solver type. */
136 PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
137 {
138   PetscFunctionBegin;
139   *type = MATSOLVERCUSPARSE;
140   PetscFunctionReturn(0);
141 }
142 
143 /*MC
144   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
145   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
146   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
147   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
148   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
149   algorithms are not recommended. This class does NOT support direct solver operations.
150 
151   Level: beginner
152 
153 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
154 M*/
155 
/* Factory for MATSOLVERCUSPARSE factor matrices: creates an n x n
   MATSEQAIJCUSPARSE matrix B, marks it with the requested factor type, and
   installs the symbolic-factorization kernels that match that type. Errors
   with PETSC_ERR_SUP for factor types other than LU/ILU/ILUDT/Cholesky/ICC. */
156 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
157 {
158   PetscErrorCode ierr;
159   PetscInt       n = A->rmap->n;
160 
161   PetscFunctionBegin;
162   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
163   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
164   (*B)->factortype = ftype;
  /* factorization here benefits from a reordering of the matrix */
165   (*B)->useordering = PETSC_TRUE;
166   ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
167 
168   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
169     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
170     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
171     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
172   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
173     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
174     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
175   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
176 
  /* MAT_SKIP_ALLOCATION: the symbolic phase allocates the factor's storage */
177   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
178   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
179   PetscFunctionReturn(0);
180 }
181 
/* Type-specific implementation of MatCUSPARSESetFormat() for MATSEQAIJCUSPARSE:
   for a sequential matrix both MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL select the
   single stored format; any other operation is unsupported. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT: /* fall through: both operations set the same field */
  case MAT_CUSPARSE_ALL:
    cusp->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
199 
200 /*@
201    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
202    operation. Only the MatMult operation can use different GPU storage formats
203    for MPIAIJCUSPARSE matrices.
204    Not Collective
205 
206    Input Parameters:
207 +  A - Matrix of type SEQAIJCUSPARSE
208 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
209 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
210 
211    Output Parameter:
212 
213    Level: intermediate
214 
215 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
216 @*/
217 PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
218 {
219   PetscErrorCode ierr;
220 
221   PetscFunctionBegin;
222   PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation composed as "MatCUSPARSESetFormat_C";
     PetscTryMethod makes this a no-op for matrix types that do not provide one */
223   ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
224   PetscFunctionReturn(0);
225 }
226 
227 /*@
228    MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose
229 
230    Collective on mat
231 
232    Input Parameters:
233 +  A - Matrix of type SEQAIJCUSPARSE
234 -  transgen - the boolean flag
235 
236    Level: intermediate
237 
238 .seealso: MATSEQAIJCUSPARSE, MatAIJCUSPARSESetGenerateTranspose()
239 @*/
240 PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
241 {
242   PetscErrorCode ierr;
243   PetscBool      flg;
244 
245   PetscFunctionBegin;
246   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  /* silently ignore matrices that are not MATSEQAIJCUSPARSE */
247   ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
248   if (flg) {
249     Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
250 
251     if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
252     cusp->transgen = transgen;
253     if (!transgen) { /* need to destroy the transpose matrix if present to prevent from logic errors if transgen is set to true later */
254       ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
255     }
256   }
257   PetscFunctionReturn(0);
258 }
259 
/* Processes the -mat_cusparse_* command line options for a MATSEQAIJCUSPARSE
   matrix: explicit-transpose generation, storage format (for SpMV only, or for
   SpMV and TriSolve), and - on CUDA 11+ - the cuSPARSE SpMV/SpMM/csr2csc
   algorithm choices. All options are skipped for factored matrices. */
260 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
261 {
262   PetscErrorCode           ierr;
263   MatCUSPARSEStorageFormat format;
264   PetscBool                flg;
265   Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
266 
267   PetscFunctionBegin;
268   ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
269   if (A->factortype == MAT_FACTOR_NONE) {
270     PetscBool transgen = cusparsestruct->transgen;
271 
272     ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);CHKERRQ(ierr);
273     if (flg) {ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr);}
274 
275     ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
276                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
277     if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}
278 
279     ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
280                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
281     if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
282    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
283     cusparsestruct->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
284     ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
285                             "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
286     /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
287     if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
288 
289     cusparsestruct->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
290     ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
291                             "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
  /* same position-vs-value consistency guard as for the SpMV algorithm above */
292     if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
293 
294     cusparsestruct->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
295     ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
296                             "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
297     if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
298    #endif
299   }
300   ierr = PetscOptionsTail();CHKERRQ(ierr);
301   PetscFunctionReturn(0);
302 }
303 
/* Symbolic ILU factorization: discards any stale GPU triangular-factor data on
   B, delegates the symbolic phase to the CPU SeqAIJ implementation, then
   installs the CUSPARSE numeric-factorization kernel. */
304 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
305 {
306   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
307   PetscErrorCode               ierr;
308 
309   PetscFunctionBegin;
310   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
311   ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
312   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
313   PetscFunctionReturn(0);
314 }
315 
/* Symbolic LU factorization: same pattern as the ILU variant - reset GPU
   triangular-factor data, run the CPU symbolic phase, install the CUSPARSE
   numeric kernel. */
316 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
317 {
318   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
319   PetscErrorCode               ierr;
320 
321   PetscFunctionBegin;
322   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
323   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
324   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
325   PetscFunctionReturn(0);
326 }
327 
/* Symbolic ICC factorization: reset GPU triangular-factor data, run the CPU
   symbolic phase, install the CUSPARSE Cholesky numeric kernel. */
328 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
329 {
330   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
331   PetscErrorCode               ierr;
332 
333   PetscFunctionBegin;
334   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
335   ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
336   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
337   PetscFunctionReturn(0);
338 }
339 
/* Symbolic Cholesky factorization: reset GPU triangular-factor data, run the
   CPU symbolic phase, install the CUSPARSE Cholesky numeric kernel. */
340 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
341 {
342   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
343   PetscErrorCode               ierr;
344 
345   PetscFunctionBegin;
346   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
347   ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
348   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
349   PetscFunctionReturn(0);
350 }
351 
/* Builds the unit-diagonal lower-triangular factor L of an ILU factorization
   on the host (from the SeqAIJ data of the factored matrix A), uploads it to
   the GPU in CSR form, and runs the cuSPARSE triangular-solve analysis.
   On a second call with an existing factor structure only the numerical values
   are refreshed and re-uploaded. Runs only when the up-to-date copy of the
   matrix lives on the CPU (or the GPU copy is unallocated). */
352 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
353 {
354   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
355   PetscInt                          n = A->rmap->n;
356   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
357   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
358   cusparseStatus_t                  stat;
359   const PetscInt                    *ai = a->i,*aj = a->j,*vi;
360   const MatScalar                   *aa = a->a,*v;
361   PetscInt                          *AiLo, *AjLo;
362   PetscInt                          i,nz, nzLower, offset, rowOffset;
363   PetscErrorCode                    ierr;
364   cudaError_t                       cerr;
365 
366   PetscFunctionBegin;
367   if (!n) PetscFunctionReturn(0);
368   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
369     try {
370       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
371       nzLower=n+ai[n]-ai[1];
372       if (!loTriFactor) {
373         PetscScalar                       *AALo;
374 
        /* pinned host buffers so the later thrust assign()s copy from page-locked memory */
375         cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
376 
377         /* Allocate Space for the lower triangular matrix */
378         cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
379         cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
380 
381         /* Fill the lower triangular matrix */
382         AiLo[0]  = (PetscInt) 0;
383         AiLo[n]  = nzLower;
384         AjLo[0]  = (PetscInt) 0;
385         AALo[0]  = (MatScalar) 1.0;
386         v        = aa;
387         vi       = aj;
388         offset   = 1;
389         rowOffset= 1;
390         for (i=1; i<n; i++) {
391           nz = ai[i+1] - ai[i];
392           /* additional 1 for the term on the diagonal */
393           AiLo[i]    = rowOffset;
394           rowOffset += nz+1;
395 
396           ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
397           ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);
398 
          /* append the implicit unit diagonal entry for row i */
399           offset      += nz;
400           AjLo[offset] = (PetscInt) i;
401           AALo[offset] = (MatScalar) 1.0;
402           offset      += 1;
403 
404           v  += nz;
405           vi += nz;
406         }
407 
408         /* allocate space for the triangular factor information */
409         ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
410         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
411         /* Create the matrix description */
412         stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
413         stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
414        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
415         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
416        #else
417         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
418        #endif
419         stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
420         stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
421 
422         /* set the operation */
423         loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
424 
425         /* set the matrix */
426         loTriFactor->csrMat = new CsrMatrix;
427         loTriFactor->csrMat->num_rows = n;
428         loTriFactor->csrMat->num_cols = n;
429         loTriFactor->csrMat->num_entries = nzLower;
430 
431         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
432         loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);
433 
434         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
435         loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);
436 
437         loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
438         loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
439 
440         /* Create the solve analysis information */
441         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
442         stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
443       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* query the scratch-buffer size the triangular solve needs, then allocate it on the device */
444         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
445                                        loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
446                                        loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
447                                        loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
448                                        &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
449         cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
450       #endif
451 
452         /* perform the solve analysis */
453         stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
454                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
455                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
456                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
457                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
458                                  ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
459                                #endif
460 );CHKERRCUSPARSE(stat);
461         cerr = WaitForCUDA();CHKERRCUDA(cerr);
462         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
463 
464         /* assign the pointer */
465         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        /* AALo is kept (as AA_h) for fast value updates; the index buffers are no longer needed */
466         loTriFactor->AA_h = AALo;
467         cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
468         cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
469         ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
470       } else { /* update values only */
471         if (!loTriFactor->AA_h) {
472           cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
473         }
474         /* Fill the lower triangular matrix */
475         loTriFactor->AA_h[0]  = 1.0;
476         v        = aa;
477         vi       = aj;
478         offset   = 1;
479         for (i=1; i<n; i++) {
480           nz = ai[i+1] - ai[i];
481           ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
482           offset      += nz;
483           loTriFactor->AA_h[offset] = 1.0;
484           offset      += 1;
485           v  += nz;
486         }
487         loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
488         ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
489       }
490     } catch(char *ex) {
491       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
492     }
493   }
494   PetscFunctionReturn(0);
495 }
496 
/* Builds the upper-triangular factor U of an ILU factorization on the host
   (from the SeqAIJ data of the factored matrix A, whose U part is stored
   backwards via a->diag), uploads it to the GPU in CSR form, and runs the
   cuSPARSE triangular-solve analysis. The stored diagonal entries are the
   reciprocals 1/u_ii. On a second call with an existing factor structure only
   the numerical values are refreshed and re-uploaded. */
497 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
498 {
499   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
500   PetscInt                          n = A->rmap->n;
501   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
502   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
503   cusparseStatus_t                  stat;
504   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
505   const MatScalar                   *aa = a->a,*v;
506   PetscInt                          *AiUp, *AjUp;
507   PetscInt                          i,nz, nzUpper, offset;
508   PetscErrorCode                    ierr;
509   cudaError_t                       cerr;
510 
511   PetscFunctionBegin;
512   if (!n) PetscFunctionReturn(0);
513   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
514     try {
515       /* next, figure out the number of nonzeros in the upper triangular matrix. */
516       nzUpper = adiag[0]-adiag[n];
517       if (!upTriFactor) {
518         PetscScalar *AAUp;
519 
        /* pinned host buffers so the later thrust assign()s copy from page-locked memory */
520         cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
521 
522         /* Allocate Space for the upper triangular matrix */
523         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
524         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
525 
526         /* Fill the upper triangular matrix */
527         AiUp[0]=(PetscInt) 0;
528         AiUp[n]=nzUpper;
529         offset = nzUpper;
        /* rows are traversed last-to-first because adiag[] indexes U backwards */
530         for (i=n-1; i>=0; i--) {
531           v  = aa + adiag[i+1] + 1;
532           vi = aj + adiag[i+1] + 1;
533 
534           /* number of elements NOT on the diagonal */
535           nz = adiag[i] - adiag[i+1]-1;
536 
537           /* decrement the offset */
538           offset -= (nz+1);
539 
540           /* first, set the diagonal elements */
541           AjUp[offset] = (PetscInt) i;
542           AAUp[offset] = (MatScalar)1./v[nz];
543           AiUp[i]      = AiUp[i+1] - (nz+1);
544 
545           ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
546           ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
547         }
548 
549         /* allocate space for the triangular factor information */
550         ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
551         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
552 
553         /* Create the matrix description */
554         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
555         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
556        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
557         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
558        #else
559         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
560        #endif
561         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
562         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
563 
564         /* set the operation */
565         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
566 
567         /* set the matrix */
568         upTriFactor->csrMat = new CsrMatrix;
569         upTriFactor->csrMat->num_rows = n;
570         upTriFactor->csrMat->num_cols = n;
571         upTriFactor->csrMat->num_entries = nzUpper;
572 
573         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
574         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
575 
576         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
577         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
578 
579         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
580         upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
581 
582         /* Create the solve analysis information */
583         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
584         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
585       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* query the scratch-buffer size the triangular solve needs, then allocate it on the device */
586         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
587                                      upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
588                                      upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
589                                      upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
590                                      &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
591         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
592       #endif
593 
594         /* perform the solve analysis */
595         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
596                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
597                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
598                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
599                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
600                                  ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
601                                #endif
602 );CHKERRCUSPARSE(stat);
603         cerr = WaitForCUDA();CHKERRCUDA(cerr);
604         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
605 
606         /* assign the pointer */
607         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
        /* AAUp is kept (as AA_h) for fast value updates; the index buffers are no longer needed */
608         upTriFactor->AA_h = AAUp;
609         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
610         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
611         ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
612       } else {
        /* update values only: sparsity is unchanged, so refill AA_h and re-upload */
613         if (!upTriFactor->AA_h) {
614           cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
615         }
616         /* Fill the upper triangular matrix */
617         offset = nzUpper;
618         for (i=n-1; i>=0; i--) {
619           v  = aa + adiag[i+1] + 1;
620 
621           /* number of elements NOT on the diagonal */
622           nz = adiag[i] - adiag[i+1]-1;
623 
624           /* decrement the offset */
625           offset -= (nz+1);
626 
627           /* first, set the diagonal elements */
628           upTriFactor->AA_h[offset] = 1./v[nz];
629           ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
630         }
631         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
632         ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
633       }
634     } catch(char *ex) {
635       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
636     }
637   }
638   PetscFunctionReturn(0);
639 }
640 
/*
   Uploads both ILU triangular factors of A to the GPU and, when the row/column
   orderings are not the identity, caches the permutation indices on the device
   so the permuted MatSolve paths can apply them without host round-trips.
 */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           rowis    = aij->row;
  IS                           colis    = aij->icol;
  PetscBool                    rowIsIdentity,colIsIdentity;
  PetscInt                     nrows    = A->rmap->n;

  PetscFunctionBegin;
  if (!factors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");

  /* build (or refresh) the lower and upper triangular factors on the device */
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  if (!factors->workVector) factors->workVector = new THRUSTARRAY(nrows);
  factors->nnz = aij->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* cache the row permutation on the device (applied before the lower solve) */
  ierr = ISIdentity(rowis,&rowIsIdentity);CHKERRQ(ierr);
  if (!rowIsIdentity && !factors->rpermIndices) {
    const PetscInt *ridx;

    ierr = ISGetIndices(rowis,&ridx);CHKERRQ(ierr);
    factors->rpermIndices = new THRUSTINTARRAY(nrows);
    factors->rpermIndices->assign(ridx,ridx+nrows);
    ierr = ISRestoreIndices(rowis,&ridx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(nrows*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* cache the column permutation on the device (applied after the upper solve) */
  ierr = ISIdentity(colis,&colIsIdentity);CHKERRQ(ierr);
  if (!colIsIdentity && !factors->cpermIndices) {
    const PetscInt *cidx;

    ierr = ISGetIndices(colis,&cidx);CHKERRQ(ierr);
    factors->cpermIndices = new THRUSTINTARRAY(nrows);
    factors->cpermIndices->assign(cidx,cidx+nrows);
    ierr = ISRestoreIndices(colis,&cidx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(nrows*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
684 
/*
   Builds the ICC (incomplete Cholesky) triangular factors of A on the GPU.

   Only the upper-triangular CSR pattern is stored. The "upper" factor holds
   U with unit diagonal treated specially (the diagonal slot stores 1/d and the
   off-diagonals are negated); the "lower" factor reuses the SAME CSR pattern
   but is solved with CUSPARSE_OPERATION_TRANSPOSE, and its values are the
   upper factor's values divided by the diagonal (see the AALo computation).

   First call: allocates descriptors/CSR storage and runs the cuSPARSE solve
   analysis for both factors. Subsequent calls (factors already exist): only
   recompute the numerical values and re-upload them.
 */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;   /* pinned host CSR row offsets / column indices for U */
  PetscScalar                       *AAUp;          /* pinned host values of the upper factor */
  PetscScalar                       *AALo;          /* pinned host values of the lower factor (U^T solve) */
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is viewed both as Mat_SeqAIJ (for nz) and as
     Mat_SeqSBAIJ (for i/j/a) — assumes the ICC factor stores an
     SBAIJ-compatible layout; confirm against the ICC factorization path. */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* nothing to do unless the current values live (only) on the CPU */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers for async-friendly uploads */
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];  /* diagonal slot stores the inverse */
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            /* off-diagonals: negated for U; additionally scaled by 1/d for L */
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        /* unit diagonal: the stored diagonal entries (1/d) are skipped by the solve */
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                                #endif
);CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        /* UPPER fill + TRANSPOSE op below: the lower solve is U^T x = b on the upper pattern */
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        /* the lower factor shares the upper factor's CSR pattern (AiUp/AjUp) */
        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                                #endif
);CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      } else {
        /* factors already exist: only recompute the values and re-upload */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
900 
/*
   Uploads the ICC triangular factors of A to the GPU and, for a non-identity
   ordering, caches the permutation and its inverse on the device for the
   permuted MatSolve paths.
 */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           perm     = aij->row;
  PetscBool                    permIsIdentity;
  PetscInt                     nrows    = A->rmap->n;

  PetscFunctionBegin;
  if (!factors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!factors->workVector) factors->workVector = new THRUSTARRAY(nrows);
  /* off-diagonal entries appear in both factors, the diagonal only once */
  factors->nnz = (aij->nz-nrows)*2 + nrows;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* upload the permutation (rows) and its inverse (columns) when non-trivial */
  ierr = ISIdentity(perm,&permIsIdentity);CHKERRQ(ierr);
  if (!permIsIdentity) {
    IS             inv;
    const PetscInt *pidx,*ipidx;

    ierr = ISInvertPermutation(perm,PETSC_DECIDE,&inv);CHKERRQ(ierr);
    ierr = ISGetIndices(inv,&ipidx);CHKERRQ(ierr);
    ierr = ISGetIndices(perm,&pidx);CHKERRQ(ierr);
    factors->rpermIndices = new THRUSTINTARRAY(nrows);
    factors->rpermIndices->assign(pidx,pidx+nrows);
    factors->cpermIndices = new THRUSTINTARRAY(nrows);
    factors->cpermIndices->assign(ipidx,ipidx+nrows);
    ierr = ISRestoreIndices(inv,&ipidx);CHKERRQ(ierr);
    ierr = ISDestroy(&inv);CHKERRQ(ierr);
    ierr = ISRestoreIndices(perm,&pidx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*nrows*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
938 
/*
   Numeric LU factorization: defers the factorization itself to the CPU
   (MatLUFactorNumeric_SeqAIJ), then selects the GPU solve kernels and uploads
   the triangular factors.
 */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *baij = (Mat_SeqAIJ*)B->data;
  PetscBool      rowIsIdentity,colIsIdentity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* determine which version of MatSolve needs to be used:
     the natural-ordering kernels avoid the permutation step */
  ierr = ISIdentity(baij->row,&rowIsIdentity);CHKERRQ(ierr);
  ierr = ISIdentity(baij->col,&colIsIdentity);CHKERRQ(ierr);
  if (rowIsIdentity && colIsIdentity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors onto the GPU and run the solve analysis */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
969 
/*
   Numeric Cholesky factorization: defers the factorization itself to the CPU
   (MatCholeskyFactorNumeric_SeqAIJ), then selects the GPU solve kernels and
   uploads the triangular factors.
 */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *baij = (Mat_SeqAIJ*)B->data;
  PetscBool      permIsIdentity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* determine which version of MatSolve needs to be used:
     the natural-ordering kernels avoid the permutation step */
  ierr = ISIdentity(baij->row,&permIsIdentity);CHKERRQ(ierr);
  if (permIsIdentity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors onto the GPU and run the solve analysis */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
999 
/*
   Builds the transposes (CSC forms) of both triangular factors on the GPU and
   runs the cuSPARSE triangular-solve analysis on them, so MatSolveTranspose
   can be served without transposing on every solve.

   Bug fix: the MAT_CUSPARSEGenerateTranspose event was begun twice (both the
   call before and the call after each cusparse_csr2csc used
   PetscLogEventBegin); the second call of each pair must be PetscLogEventEnd,
   otherwise the profiling event stack is left unbalanced.
 */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor;
     the fill mode flips because the transpose of a lower factor is upper */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                       loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                       loTriFactor->csrMat->values->data().get(),
                                       loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(),
                                       loTriFactorT->csrMat->values->data().get(),
                                       loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer
                        #else
                          loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
                        #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  /* was PetscLogEventBegin: the event must be ended here to balance the Begin above */
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                           loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                          #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                upTriFactor->csrMat->values->data().get(),
                                upTriFactor->csrMat->row_offsets->data().get(),
                                upTriFactor->csrMat->column_indices->data().get(),
                                upTriFactorT->csrMat->values->data().get(),
                                upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                CUSPARSE_ACTION_NUMERIC,indexBase,
                                CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer
                        #else
                          upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
                        #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  /* was PetscLogEventBegin: the event must be ended here to balance the Begin above */
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
  #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
  #endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                           upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                          #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
1201 
/* Unary functor for thrust: convert a PetscScalar to a PetscInt by truncating
   its real part. Used below (MatSeqAIJCUSPARSEGenerateTransposeForMult) to turn
   the scalar-encoded entry indices produced by csr2csc back into integers. */
struct PetscScalarToPetscInt
{
  __host__ __device__
  PetscInt operator()(PetscScalar v)
  {
    return static_cast<PetscInt>(PetscRealPart(v));
  }
};
1210 
/* Build (or refresh) the cached cuSPARSE representation of A^T so MatMultTranspose
   can run as an ordinary SpMV on the stored transpose.

   For CSR storage the symbolic transpose is computed once: csr2csc is run on the
   sequence 0,1,2,... (encoded as scalars) and the permuted result gives, for each
   entry of A^T, the index of the source entry in A. That permutation is cached in
   cusparsestruct->csr2csc_i; later calls only gather A's current values through it.
   Collective on nothing; errors are raised with SETERRQ/CHKERR macros. */
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* nothing to do when transpose generation is disabled or the matrix is empty */
  if (!cusparsestruct->transgen || !A->rmap->n || !A->cmap->n) PetscFunctionReturn(0);
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); /* make the device copy of A current first */
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (cusparsestruct->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing matTranspose struct");
  if (cusparsestruct->transupdated) PetscFunctionReturn(0); /* cached transpose already matches current values */
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
    /* non-CSR formats cannot be value-updated in place: drop any stale transpose and rebuild */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
  }
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

    /* set alpha and beta: device-resident scalars used by the SpMV calls */
    err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* allocate the CSR skeleton of A^T: dimensions swapped, same nonzero count */
      CsrMatrix *matrixT = new CsrMatrix;
      matstructT->mat = matrixT;
      matrixT->num_rows = A->cmap->n;
      matrixT->num_cols = A->rmap->n;
      matrixT->num_entries = a->nz;
      matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values = new THRUSTARRAY(a->nz);

      /* device copy of A's row offsets, needed as csr2csc input below */
      if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&matstructT->matDescr,
                               matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                               matrixT->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
     #endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
   #else
      /* pre-CUDA-11 HYB/ELL path: HYB -> CSR -> CSC (= transpose) -> HYB */
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows = A->rmap->n;
      temp->num_cols = A->cmap->n;
      temp->num_entries = a->nz;
      temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle,
                              matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get());CHKERRCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      tempT->num_rows = A->rmap->n;
      tempT->num_cols = A->cmap->n;
      tempT->num_entries = a->nz;
      tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                              temp->num_cols, temp->num_entries,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get(),
                              tempT->values->data().get(),
                              tempT->column_indices->data().get(),
                              tempT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

      /* Last, convert CSC to HYB */
      cusparseHybMat_t hybMat;
      stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
      cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
        CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                              matstructT->descr, tempT->values->data().get(),
                              tempT->row_offsets->data().get(),
                              tempT->column_indices->data().get(),
                              hybMat, 0, partition);CHKERRCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      cusparsestruct->transupdated = PETSC_TRUE;
      /* delete temporaries: the HYB result owns its own storage */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY*) tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
        delete (CsrMatrix*) tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY*) temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
        delete (CsrMatrix*) temp;
      }
     #endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
    if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix");
    if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix rows");
    if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix cols");
    if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix values");
    if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT");
    if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT rows");
    if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT cols");
    if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
      ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    if (!cusparsestruct->csr2csc_i) {
      /* first time through: compute the CSR->CSC entry permutation.
         csr2csc is run on the sequence 0,1,2,... stored as scalars; its output values
         are then exactly the source index (in A) of each entry of A^T */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* CUDA >= 11 requires an explicit workspace for csr2csc (cusparseCsr2cscEx2) */
      void   *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                           A->cmap->n, matrix->num_entries,
                                           matrix->values->data().get(),
                                           cusparsestruct->rowoffsets_gpu->data().get(),
                                           matrix->column_indices->data().get(),
                                           matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                           CUSPARSE_ACTION_NUMERIC,indexBase,
                                           cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat);
      err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err);
     #endif

      stat = cusparse_csr2csc(cusparsestruct->handle,
                              A->rmap->n,A->cmap->n,matrix->num_entries,
                              csr2csc_a.data().get(),cusparsestruct->rowoffsets_gpu->data().get(),matrix->column_indices->data().get(),
                              matrixT->values->data().get(),
                             #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                              matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                              CUSPARSE_ACTION_NUMERIC,indexBase,
                              cusparsestruct->csr2cscAlg, csr2cscBuffer
                             #else
                              matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase
                             #endif
);CHKERRCUSPARSE(stat);
      /* decode the permuted scalar sequence back into the integer index map */
      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      err = cudaFree(csr2cscBuffer);CHKERRCUDA(err);
     #endif
    }
    /* gather A's current values into A^T through the cached permutation */
    PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
                                                     thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
                                                     matrixT->values->begin()));
  }
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  cusparsestruct->transupdated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
1404 
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/* Solve A^T x = b on the GPU using the transposed triangular factors.
   Since A = L U (with row/column permutations rperm/cperm applied around it),
   A^T = U^T L^T, so the solve order here is U^T first, then L^T. The transposed
   factors are built lazily on first use and cached on the TriFactors struct. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly (first call only; cached afterwards) */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: x <- b(rperm) */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve U^T: tempGPU <- U^T \ x */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Then, solve L^T: x <- L^T \ tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* one multiply and one add per nonzero of L and U, minus the n divisions counted once */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1491 
/* Solve A^T x = b when the factorization uses natural ordering, i.e. no row or
   column permutations are needed. Same solve order as the permuted variant
   (U^T then L^T, since A^T = U^T L^T) but reads b and writes x directly. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly (first call only; cached afterwards) */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U^T: tempGPU <- U^T \ b */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Then, solve L^T: x <- L^T \ tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* one multiply and one add per nonzero of L and U, minus the n divisions counted once */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1559 
/* Solve A x = b on the GPU using the cached triangular factors A = L U,
   with row permutation rperm applied to b on entry and column permutation
   cperm applied to the result on exit. Standard order: L first, then U. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: tempGPU <- b(rperm) */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L: x <- L \ tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Then, solve U: tempGPU <- U \ x */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Last, reorder with the column permutation: x <- tempGPU(cperm) */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* one multiply and one add per nonzero of L and U, minus the n divisions counted once */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1634 
/* Solve A x = b with the cached factors A = L U when natural ordering was used,
   so no row/column permutations are required: x = U \ (L \ b). */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *b;
  PetscScalar                       *x;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *fs   = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lo   = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *up   = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtr;
  THRUSTARRAY                       *work = (THRUSTARRAY*)fs->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Fetch device pointers for the input and output vectors */
  ierr = VecCUDAGetArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&b);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* Forward solve: work <- L \ b */
  stat = cusparse_solve(fs->handle, lo->solveOp,
                        lo->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        lo->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, lo->descr,
                        lo->csrMat->values->data().get(),
                        lo->csrMat->row_offsets->data().get(),
                        lo->csrMat->column_indices->data().get(),
                        lo->solveInfo,
                        b, work->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,lo->solvePolicy, lo->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Backward solve: x <- U \ work */
  stat = cusparse_solve(fs->handle, up->solveOp,
                        up->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        up->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, up->descr,
                        up->csrMat->values->data().get(),
                        up->csrMat->row_offsets->data().get(),
                        up->csrMat->column_indices->data().get(),
                        up->solveInfo,
                        work->data().get(), x
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,up->solvePolicy, up->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Hand the vectors back and account for the work */
  ierr = VecCUDARestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&x);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*fs->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1694 
/* Mirror the matrix values from the device back to the host AIJ storage when the
   device copy is the only up-to-date one; afterwards both copies are valid. */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *aij  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;
  cudaError_t        cerr;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    /* NOTE(review): the device matrix is cast to CsrMatrix unconditionally; presumably
       this path is only reached with MAT_CUSPARSE_CSR storage -- confirm for ELL/HYB */
    CsrMatrix *csr = (CsrMatrix*)cusp->mat->mat;

    ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    /* only the values move; the sparsity pattern on the host is already correct */
    cerr = cudaMemcpy(aij->a, csr->values->data().get(), aij->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuToCpu(aij->nz*sizeof(PetscScalar));CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    A->offloadmask = PETSC_OFFLOAD_BOTH; /* host and device now agree */
  }
  PetscFunctionReturn(0);
}
1715 
/* MatSeqAIJGetArray implementation: hand out the host values array, first making
   sure it is current. The caller may modify the array, so the device copy is
   marked stale (PETSC_OFFLOAD_CPU) and will be re-uploaded on next GPU use. */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr   = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); /* sync host values */
  *array = aij->a;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
1727 
/* Upload the host AIJ matrix to the GPU, building/refreshing the cuSPARSE
   multiplication structure.  Two paths:
   - fast path: the nonzero pattern is unchanged (same nonzerostate) and the
     format is CSR, so only the numerical values are re-assigned into the
     existing device CsrMatrix;
   - slow path: the whole Mat_SeqAIJCUSPARSEMultStruct is destroyed and rebuilt
     from the host CSR arrays (using the compressed-row arrays when available),
     in CSR format, or ELL/HYB format on pre-CUDA-11 toolkits. */
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR values");
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* same sparsity pattern: refresh values in place, then invalidate the
         cached transpose values (pattern of the transpose stays valid) */
      matrix->values->assign(a->a, a->a+a->nz);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
    } else {
      PetscInt nnz;
      /* pattern changed (or non-CSR format): drop all cached device data and rebuild */
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        if (a->compressedrow.use) {
          /* compressed-row storage: m rows are stored; ridx presumably holds
             their global row indices (assigned to cprowIndices below) */
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR row data");
        if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR column data");
        /* no host values yet: take nnz from the row offsets, and do not mark
           the host copy as valid at the end (both stays FALSE) */
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
        else nnz = a->nz;

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident scalars 1 and 0, used with CUSPARSE_POINTER_MODE_DEVICE below */
        err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          /* values may be left unassigned (uninitialized) when a->a is NULL */
          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                    mat->num_rows, mat->num_cols, mat->num_entries,
                                    mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                    mat->values->data().get(),
                                    CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                    CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
          }
         #endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
         #else
          /* build a temporary host-side-driven CSR, convert to HYB/ELL, then free it */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          /* ELL = max partition, HYB = automatic partition */
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
              matstruct->descr, mat->values->data().get(),
              mat->row_offsets->data().get(),
              mat->column_indices->data().get(),
              hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          /* the CSR scratch copy is no longer needed after conversion */
          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
         #endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        /* host->device traffic: row offsets + column indices (int), optional row
           index map (PetscInt), 3 scalar constants + values (PetscScalar) */
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
1884 
/* Thrust functor for zipped iterators: accumulate element 0 into element 1,
   i.e. y += x for a tuple (x, y). Usable on both host and device. */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
1894 
/* Thrust functor for zipped iterators: copy element 0 into element 1,
   i.e. y = x for a tuple (x, y). Usable on both host and device. */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    const auto src = thrust::get<0>(t);
    thrust::get<1>(t) = src;
  }
};
1904 
/* Thrust functor for zipped iterators: copy element 1 into element 0,
   i.e. x = y for a tuple (x, y) — the reverse of VecCUDAEquals. */
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    const auto src = thrust::get<1>(t);
    thrust::get<0>(t) = src;
  }
};
1914 
/* Per-product data attached to C->product->data for the cuSPARSE sparse-dense
   (SpMM/csrmm) and sparse-sparse (SpGEMM) product implementations below. */
struct MatMatCusparse {
  PetscBool             cisdense;  /* C was MATSEQDENSE (CPU) on entry to the symbolic phase; converted back after the numeric phase */
  PetscScalar           *Bt;       /* device buffer holding an explicit B^T for pre-CUDA-11 csrmm, which cannot transpose B */
  Mat                   X;         /* intermediate dense product used by MATPRODUCT_RARt / MATPRODUCT_PtAP */
  PetscBool             reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops;    /* flop count logged in the numeric sparse-sparse phase */
  CsrMatrix             *Bcsr;    /* uncompressed copy of B when B is stored in compressed row form */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr; /* descriptor for Bcsr (used in place of Bmat->matDescr) */
  PetscBool             initialized;   /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr;
  cusparseDnMatDescr_t  matCDescr;
  PetscInt              Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
  size_t                mmBufferSize; /* size of mmBuffer in bytes */
  void                  *mmBuffer;    /* work buffer for cusparseSpMM / cusparseSpGEMM_compute */
  void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};
1934 
/* Destroy callback installed on C->product->destroy: releases all device
   buffers, cuSPARSE descriptors, the intermediate matrix X, and finally the
   MatMatCusparse struct itself. */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse   *mm = (MatMatCusparse *)data;
  PetscErrorCode   ierr;
  cudaError_t      cerr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t stat;
 #endif

  PetscFunctionBegin;
  cerr = cudaFree(mm->Bt);CHKERRCUDA(cerr); /* cudaFree(NULL) is a no-op, so an unset Bt is fine */
  delete mm->Bcsr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (mm->matSpBDescr) {
    stat = cusparseDestroySpMat(mm->matSpBDescr);CHKERRCUSPARSE(stat);
  }
  if (mm->mmBuffer) {
    cerr = cudaFree(mm->mmBuffer);CHKERRCUDA(cerr);
  }
  if (mm->mmBuffer2) {
    cerr = cudaFree(mm->mmBuffer2);CHKERRCUDA(cerr);
  }
  if (mm->matBDescr) {
    stat = cusparseDestroyDnMat(mm->matBDescr);CHKERRCUSPARSE(stat);
  }
  if (mm->matCDescr) {
    stat = cusparseDestroyDnMat(mm->matCDescr);CHKERRCUSPARSE(stat);
  }
  if (mm->spgemmDesc) {
    stat = cusparseSpGEMM_destroyDescr(mm->spgemmDesc);CHKERRCUSPARSE(stat);
  }
 #endif
  ierr = MatDestroy(&mm->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1959 
1960 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1961 
/* Numeric phase of a sparse(A,SEQAIJCUSPARSE) x dense(B) product.
   Supports AB, AtB, ABt, plus RARt/PtAP, where an intermediate X = A*op(B) is
   computed here and a dense-dense product against B forms C afterwards.
   Uses cusparseSpMM (CUDA >= 11, with cached descriptors and work buffer in
   mmdata) or cusparseXcsrmm (older toolkits, with an explicit B^T in mmdata->Bt
   since csrmm cannot transpose B). */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* pick op(A) and the result dimensions m x n (for RARt/PtAP these are the
     dimensions of the intermediate X, not of C) */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    /* either multiply by A with the transpose flag, or by an explicitly
       generated A^T without it, depending on cusp->transgen */
    if (!cusp->transgen) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);

  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* RARt/PtAP write the intermediate X here; the other products write C directly */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    /* a changed leading dimension invalidates the cached dense descriptor */
    if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
    /* grow the work buffer only when the required size exceeds the cached one */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
      cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
 #else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr; /* NOTE: shadows the outer cudaError_t cerr within this scope */

    /* form an explicit transpose of B in mmdata->Bt (allocated in the symbolic phase) */
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k    = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
 #endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  /* for RARt/PtAP, finish with a dense product of B (resp. its transpose,
     per the flags passed below) against the intermediate X to form C */
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  /* restore the original (CPU) types of C and B if we converted them above */
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2135 
/* Symbolic phase of a sparse(A,SEQAIJCUSPARSE) x dense(B) product: sets the
   sizes and type of C, allocates the MatMatCusparse product data (including
   the B^T scratch buffer on pre-CUDA-11 and the intermediate X for RARt/PtAP),
   and installs the numeric-phase callback. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  /* dimensions of the final product C */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense; /* remembered so the numeric phase can convert C back */
 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
 #endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2209 
/* Numeric phase of the sparse-sparse product C = op(A)*op(B), all matrices
   SEQAIJCUSPARSE in CSR format.  Runs cusparseSpGEMM_compute/_copy (CUDA >= 11)
   or cusparse_csr_spgemm (older toolkits) into the preallocated C, then finishes
   with an abbreviated MatAssemblyEnd.  When mmdata->reusesym is set, the values
   were already computed during the symbolic phase, so only the assembly
   bookkeeping at 'finalize' runs. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t         BmatSpDescr;
#endif

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;
  A = product->A;
  B = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix*)Cmat->mat;
    if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
    goto finalize;
  }
  if (!c->nz) goto finalize; /* empty product: nothing to compute, just assemble */
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);

  /* for symmetric matrices op(M) = M, so AtB/ABt reduce to plain AB */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
  if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
  /* the transposes are assumed to have been generated in the symbolic phase
     (checked via the Missing-mult-struct errors below) */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix*)Cmat->mat;
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
  if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  /* compute into the work buffer, then copy values into C's descriptor;
     descriptors and spgemmDesc were set up in the symbolic phase */
  stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  C->offloadmask = PETSC_OFFLOAD_GPU; /* result lives only on the device */
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
  ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
  c->reallocs         = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}
2324 
/* Symbolic phase of the sparse-sparse products C = A*B, A^T*B or A*B^T with A,B of type
   SEQAIJCUSPARSE (CSR format only). Allocates C (device CSR arrays plus host i/j copies so
   C is also usable as a plain SeqAIJ), precomputes a flop estimate for logging, and - since
   the cuSPARSE APIs used here cannot do a purely symbolic pass - also computes the numeric
   values once. Numeric phase is then handled by MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *a,*b,*c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     i,j,m,n,k;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble               flops;
  PetscBool                    biscompressed,ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
  size_t                       bufSize2;
  cusparseSpMatDescr_t         BmatSpDescr;
#else
  int                          cnz;
#endif

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A    = product->A;
  B    = product->B;
  /* only plain SEQAIJCUSPARSE inputs in CSR storage are supported */
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ*)A->data;
  b = (Mat_SeqAIJ*)B->data;
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
  ptype = product->type;
  /* for symmetric operands the transpose products reduce to plain AB, avoiding an explicit transpose */
  if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
  if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  /* select operand mult structs and result sizes (m x n, inner dimension k);
     when an operand stores compressed rows (zero rows dropped) we flag it so the
     full row offsets can be reconstructed below */
  switch (ptype) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    k = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(B);CHKERRQ(ierr);
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  ierr  = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  ierr  = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  c     = (Mat_SeqAIJ*)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
  Ccsr  = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, then c will be in compressed row format */
    /* C inherits A's nonzero-row pattern: same nrows and row index list */
    c->compressedrow.nrows = a->compressedrow.nrows;
    ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
    ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
  Ccusp->mat      = Cmat;
  Ccusp->mat->mat = Ccsr;
  Ccsr->num_rows    = Ccusp->nrows;
  Ccsr->num_cols    = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
  stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
  /* device-resident scalar constants used as alpha/beta by the cusparse calls */
  cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
    /* empty product: zeroed row offsets, no entries, skip straight to the host-side assembly */
    thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
    c->nz = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix*)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix */
    /* build a shallow CSR view of B sharing column indices/values but with uncompressed row offsets */
    CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
    Bcsr = new CsrMatrix;
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices;
    Bcsr->values         = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
      ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr = Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
                               Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                               Bcsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
  /* precompute flops count */
  if (ptype == MATPRODUCT_AB) {
    /* 2 flops (multiply+add) per (a_ij, b_jk) pair */
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i+1];
      for (j=st; j<en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2.*(b->i[brow+1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt anzi = a->i[i+1] - a->i[i];
      const PetscInt bnzi = b->i[i+1] - b->i[i];
      flops += (2.*anzi)*bnzi;
    }
  } else { /* TODO */
    flops = 0.;
  }

  mmdata->flops = flops;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* CUDA >= 11: generic SpGEMM API. The required call sequence is
     workEstimation (size query) -> workEstimation (with buffer) -> compute (size query)
     -> compute (with buffer) -> copy; both buffers are kept in mmdata for reuse in the
     numeric phase */
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
                           NULL, NULL, NULL,
                           CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                           CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
  /* ask bufferSize bytes for external memory */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
  /* ask bufferSize again bytes for external memory */
  stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
  /* The CUSPARSE documentation is not clear, nor the API
     We need both buffers to perform the operations properly!
     mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
     it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
     is stored in the descriptor! What a messy API... */
  cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
  /* compute the intermediate product of A * B */
  stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  /* get matrix C non-zero entries C_nnz1 */
  stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
  c->nz = (PetscInt) C_nnz1;
  ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr);
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#else
  /* CUDA < 11: legacy csrgemm. First count nonzeros (host pointer mode for &cnz), then
     allocate and run the combined symbolic+numeric multiply */
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
  stat = cusparseXcsrgemmNnz(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
  c->nz = cnz;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
     I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
     D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
  stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalizesym:
  /* mirror the device CSR structure on the host so C also works as a SeqAIJ matrix */
  c->singlemalloc = PETSC_FALSE;
  c->free_a       = PETSC_TRUE;
  c->free_ij      = PETSC_TRUE;
  ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
  if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
    PetscInt *d_i = c->i;
    THRUSTINTARRAY ii(Ccsr->row_offsets->size());
    THRUSTINTARRAY jj(Ccsr->column_indices->size());
    ii   = *Ccsr->row_offsets;
    jj   = *Ccsr->column_indices;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  } else {
    PetscInt *d_i = c->i;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  if (ciscompressed) { /* need to expand host row offsets */
    /* fill c->i for the zero rows between consecutive nonzero rows with the running offset */
    PetscInt r = 0;
    c->i[0] = 0;
    for (k = 0; k < c->compressedrow.nrows; k++) {
      const PetscInt next = c->compressedrow.rindex[k];
      const PetscInt old = c->compressedrow.i[k];
      for (; r < next; r++) c->i[r+1] = old;
    }
    for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
  }
  ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
  c->maxnz = c->nz;
  c->nonzerorowcnt = 0;
  c->rmax = 0;
  /* per-row lengths, nonzero-row count and max row length from the expanded offsets */
  for (k = 0; k < m; k++) {
    const PetscInt nn = c->i[k+1] - c->i[k];
    c->ilen[k] = c->imax[k] = nn;
    c->nonzerorowcnt += (PetscInt)!!nn;
    c->rmax = PetscMax(c->rmax,nn);
  }
  ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
  Ccsr->num_entries = c->nz;

  C->nonzerostate++;
  ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
  Ccusp->nonzerostate = C->nonzerostate;
  C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
  C->preallocated  = PETSC_TRUE;
  C->assembled     = PETSC_FALSE;
  C->was_assembled = PETSC_FALSE;
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
2649 
2650 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2651 
2652 /* handles sparse or dense B */
2653 static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
2654 {
2655   Mat_Product    *product = mat->product;
2656   PetscErrorCode ierr;
2657   PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;
2658 
2659   PetscFunctionBegin;
2660   MatCheckProduct(mat,1);
2661   ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr);
2662   if (!product->A->boundtocpu && !product->B->boundtocpu) {
2663     ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr);
2664   }
2665   if (product->type == MATPRODUCT_ABC) {
2666     Ciscusp = PETSC_FALSE;
2667     if (!product->C->boundtocpu) {
2668       ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr);
2669     }
2670   }
2671   if (isdense) {
2672     switch (product->type) {
2673     case MATPRODUCT_AB:
2674     case MATPRODUCT_AtB:
2675     case MATPRODUCT_ABt:
2676     case MATPRODUCT_PtAP:
2677     case MATPRODUCT_RARt:
2678      if (product->A->boundtocpu) {
2679         ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
2680       } else {
2681         mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
2682       }
2683       break;
2684     case MATPRODUCT_ABC:
2685       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2686       break;
2687     default:
2688       break;
2689     }
2690   } else if (Biscusp && Ciscusp) {
2691     switch (product->type) {
2692     case MATPRODUCT_AB:
2693     case MATPRODUCT_AtB:
2694     case MATPRODUCT_ABt:
2695       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2696       break;
2697     case MATPRODUCT_PtAP:
2698     case MATPRODUCT_RARt:
2699     case MATPRODUCT_ABC:
2700       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2701       break;
2702     default:
2703       break;
2704     }
2705   } else { /* fallback for AIJ */
2706     ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
2707   }
2708   PetscFunctionReturn(0);
2709 }
2710 
/* yy = A*xx: thin wrapper around the shared mult/add kernel (no add vector, no transpose) */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2719 
/* zz = A*xx + yy: thin wrapper around the shared mult/add kernel (no transpose) */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2728 
/* yy = A^H*xx: thin wrapper around the shared mult/add kernel (trans = herm = PETSC_TRUE) */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2737 
/* zz = A^H*xx + yy: thin wrapper around the shared mult/add kernel (trans = herm = PETSC_TRUE) */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2746 
/* yy = A^T*xx: thin wrapper around the shared mult/add kernel (transpose, non-Hermitian) */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2755 
/* y[idx[i]] += x[i] for i in [0,n).
   Grid-stride loop: correct for any launch configuration and, unlike the previous
   int-typed flat index, does not overflow 32-bit arithmetic when PetscInt is 64-bit
   and n exceeds 2^31. No atomics are used, so idx entries are assumed distinct
   (true for the compressed-row indices this kernel is launched with below). */
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  const PetscInt stride = (PetscInt)gridDim.x*blockDim.x;
  for (PetscInt i = (PetscInt)blockIdx.x*blockDim.x + threadIdx.x; i < n; i += stride) y[idx[i]] += x[i];
}
2761 
2762 /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
2763 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
2764 {
2765   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
2766   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
2767   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
2768   PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
2769   PetscErrorCode               ierr;
2770   cudaError_t                  cerr;
2771   cusparseStatus_t             stat;
2772   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2773   PetscBool                    compressed;
2774 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2775   PetscInt                     nx,ny;
2776 #endif
2777 
2778   PetscFunctionBegin;
2779   if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
2780   if (!a->nonzerorowcnt) {
2781     if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
2782     else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
2783     PetscFunctionReturn(0);
2784   }
2785   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
2786   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2787   if (!trans) {
2788     matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2789     if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
2790   } else {
2791     if (herm || !cusparsestruct->transgen) {
2792       opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
2793       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2794     } else {
2795       if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);}
2796       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
2797     }
2798   }
2799   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
2800   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
2801 
2802   try {
2803     ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
2804     if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
2805     else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */
2806 
2807     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2808     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2809       /* z = A x + beta y.
2810          If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
2811          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
2812       */
2813       xptr = xarray;
2814       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
2815       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
2816      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2817       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
2818           allocated to accommodate different uses. So we get the length info directly from mat.
2819        */
2820       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2821         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2822         nx = mat->num_cols;
2823         ny = mat->num_rows;
2824       }
2825      #endif
2826     } else {
2827       /* z = A^T x + beta y
2828          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
2829          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
2830        */
2831       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
2832       dptr = zarray;
2833       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
2834       if (compressed) { /* Scatter x to work vector */
2835         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
2836         thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
2837                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2838                          VecCUDAEqualsReverse());
2839       }
2840      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2841       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2842         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2843         nx = mat->num_rows;
2844         ny = mat->num_cols;
2845       }
2846      #endif
2847     }
2848 
2849     /* csr_spmv does y = alpha op(A) x + beta y */
2850     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2851      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2852       if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
2853       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
2854         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2855         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2856         stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
2857                                 matstruct->matDescr,
2858                                 matstruct->cuSpMV[opA].vecXDescr, beta,
2859                                 matstruct->cuSpMV[opA].vecYDescr,
2860                                 cusparse_scalartype,
2861                                 cusparsestruct->spmvAlg,
2862                                 &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
2863         cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);
2864 
2865         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
2866       } else {
2867         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
2868         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
2869         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
2870       }
2871 
2872       stat = cusparseSpMV(cusparsestruct->handle, opA,
2873                                matstruct->alpha_one,
2874                                matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */
2875                                matstruct->cuSpMV[opA].vecXDescr,
2876                                beta,
2877                                matstruct->cuSpMV[opA].vecYDescr,
2878                                cusparse_scalartype,
2879                                cusparsestruct->spmvAlg,
2880                                matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
2881      #else
2882       CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2883       stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
2884                                mat->num_rows, mat->num_cols,
2885                                mat->num_entries, matstruct->alpha_one, matstruct->descr,
2886                                mat->values->data().get(), mat->row_offsets->data().get(),
2887                                mat->column_indices->data().get(), xptr, beta,
2888                                dptr);CHKERRCUSPARSE(stat);
2889      #endif
2890     } else {
2891       if (cusparsestruct->nrows) {
2892        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2893         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2894        #else
2895         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
2896         stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
2897                                  matstruct->alpha_one, matstruct->descr, hybMat,
2898                                  xptr, beta,
2899                                  dptr);CHKERRCUSPARSE(stat);
2900        #endif
2901       }
2902     }
2903     cerr = WaitForCUDA();CHKERRCUDA(cerr);
2904     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2905 
2906     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2907       if (yy) { /* MatMultAdd: zz = A*xx + yy */
2908         if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
2909           ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
2910         } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
2911           ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
2912         }
2913       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
2914         ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
2915       }
2916 
2917       /* ScatterAdd the result from work vector into the full vector when A is compressed */
2918       if (compressed) {
2919         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2920         /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred)
2921            and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
2922            prevent that. So I just add a ScatterAdd kernel.
2923          */
2924        #if 0
2925         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
2926         thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
2927                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
2928                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2929                          VecCUDAPlusEquals());
2930        #else
2931         PetscInt n = matstruct->cprowIndices->size();
2932         ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
2933        #endif
2934         cerr = WaitForCUDA();CHKERRCUDA(cerr);
2935         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2936       }
2937     } else {
2938       if (yy && yy != zz) {
2939         ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
2940       }
2941     }
2942     ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
2943     if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
2944     else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
2945   } catch(char *ex) {
2946     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
2947   }
2948   if (yy) {
2949     ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
2950   } else {
2951     ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
2952   }
2953   PetscFunctionReturn(0);
2954 }
2955 
/* zz = A^T*xx + yy, computed on the GPU.  Thin wrapper over the shared
   mult-add kernel; the two flags select the transpose (PETSC_TRUE) and
   non-Hermitian (PETSC_FALSE) variant -- see MatMultAddKernel_SeqAIJCUSPARSE. */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  err = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(err);
  PetscFunctionReturn(0);
}
2964 
/* Finish assembly: run the host SeqAIJ assembly, then, for a final assembly of
   a matrix not bound to the CPU, mark the data as resident on the GPU when a
   device-side split-CSR matrix exists. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode             ierr;
  PetscSplitCSRDataStructure *devMat = NULL;

  PetscFunctionBegin;
  /* only an unfactored matrix carries a Mat_SeqAIJCUSPARSE in spptr */
  if (A->factortype == MAT_FACTOR_NONE) devMat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); /* this does very little if assembled on GPU - call it? */
  if (mode != MAT_FLUSH_ASSEMBLY && !A->boundtocpu && devMat) A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
2981 
2982 /* --------------------------------------------------------------------------------*/
2983 /*@
2984    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
2986    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
2987    assembly performance the user should preallocate the matrix storage by setting
2988    the parameter nz (or the array nnz).  By setting these parameters accurately,
2989    performance during matrix assembly can be increased by more than a factor of 50.
2990 
2991    Collective
2992 
2993    Input Parameters:
2994 +  comm - MPI communicator, set to PETSC_COMM_SELF
2995 .  m - number of rows
2996 .  n - number of columns
2997 .  nz - number of nonzeros per row (same for all rows)
2998 -  nnz - array containing the number of nonzeros in the various rows
2999          (possibly different for each row) or NULL
3000 
3001    Output Parameter:
3002 .  A - the matrix
3003 
3004    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
3006    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
3007 
3008    Notes:
3009    If nnz is given then nz is ignored
3010 
3011    The AIJ format (also called the Yale sparse matrix format or
3012    compressed row storage), is fully compatible with standard Fortran 77
3013    storage.  That is, the stored row and column indices can begin at
3014    either one (as in Fortran) or zero.  See the users' manual for details.
3015 
3016    Specify the preallocated storage with either nz or nnz (not both).
3017    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
3018    allocation.  For large problems you MUST preallocate memory or you
3019    will get TERRIBLE performance, see the users' manual chapter on matrices.
3020 
3021    By default, this format uses inodes (identical nodes) when possible, to
3022    improve numerical efficiency of matrix-vector products and solves. We
3023    search for consecutive rows with the same nonzero structure, thereby
3024    reusing matrix information to achieve increased efficiency.
3025 
3026    Level: intermediate
3027 
3028 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
3029 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  /* create an empty matrix, size it m x n (sequential, so local == global),
     give it the CUSPARSE AIJ type, then preallocate the CSR storage */
  err = MatCreate(comm,A);CHKERRQ(err);
  err = MatSetSizes(*A,m,n,m,n);CHKERRQ(err);
  err = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(err);
  /* either nz (uniform per-row count) or nnz (per-row array) drives the preallocation */
  err = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(err);
  PetscFunctionReturn(0);
}
3041 
/* Destroy a SEQAIJCUSPARSE matrix: free the GPU-side data (the
   Mat_SeqAIJCUSPARSE struct for an unfactored matrix, or the triangular-factor
   struct for a factored one), free the device split-CSR mirror if one was
   created, detach the composed methods, then run the host AIJ destructor. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode              ierr;
  PetscSplitCSRDataStructure  *d_mat = NULL;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    /* stash the device-matrix pointer before spptr is torn down so it can be freed below */
    d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
    ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL;
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  if (d_mat) {
    Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;
    cudaError_t                err;
    PetscSplitCSRDataStructure h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* copy the device-side descriptor back to the host so device allocations embedded in it can be freed */
    err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    if (a->compressedrow.use) {
      /* NOTE(review): only diag.i is freed here in the compressed-row case;
         presumably the other device arrays are owned elsewhere -- confirm */
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  /* remove the method implementations composed onto this object */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3077 
3078 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
3079 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicate by building a plain SeqAIJ copy first and then converting it
   in place back to the CUSPARSE type. */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  err = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(err);
  err = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(err);
  PetscFunctionReturn(0);
}
3089 
/* Y = a*X + Y on the GPU.  Three paths:
   - identical nonzero patterns: a single cuBLAS axpy over the value arrays;
   - SUBSET_NONZERO_PATTERN: cuSPARSE csrgeam (sparse matrix-matrix add);
   - otherwise: fall back to the host MatAXPY_SeqAIJ.
   When `str` is UNKNOWN/DIFFERENT, the patterns are compared on the device and
   `str` is upgraded to SAME_NONZERO_PATTERN if they match, enabling the fast path. */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry,*csrx;
  cudaError_t        cerr;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
  /* if X uses a different axpy implementation it is not a GPU matrix; do the work on the host */
  if (X->ops->axpy != Y->ops->axpy) {
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
  if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
  if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;
  /* see if we can turn this into a cublas axpy */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    /* device-side comparison of the row offsets, then (only if those match) the column indices */
    bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
    if (eq) {
      eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
    }
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    cusparseStatus_t stat;
    PetscScalar      b = 1.0;       /* coefficient of Y in a*X + b*Y */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t           bufferSize;
    void             *buffer;
#endif

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    /* the a and b scalars below live on the host, so switch the pointer mode temporarily */
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* CUDA >= 11 csrgeam2 requires an explicit workspace query + allocation */
    stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
                                          &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                          &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                             cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* result written in place into Y's value array (output pattern == Y's pattern) */
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    cerr = cudaFree(buffer);CHKERRCUDA(cerr);
#else
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
#endif
    /* restore the pointer mode used elsewhere in this file */
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical patterns: the value arrays line up entry-for-entry, so ay += a*ax */
    cublasHandle_t cublasv2handle;
    cublasStatus_t berr;
    PetscBLASInt   one = 1, bnz = 1;

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else {
    /* different patterns: fall back to the host implementation */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3190 
/* Y = a*Y: scale the stored nonzero values of Y on the GPU with one cuBLAS scal. */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *y = (Mat_SeqAIJ*)Y->data;
  PetscScalar    *yarray;
  cudaError_t    cuerr;
  cublasHandle_t handle;
  cublasStatus_t cberr;
  PetscBLASInt   inc = 1, bnz = 1;

  PetscFunctionBegin;
  ierr  = MatSeqAIJCUSPARSEGetArray(Y,&yarray);CHKERRQ(ierr);
  ierr  = PetscCUBLASGetHandle(&handle);CHKERRQ(ierr);
  ierr  = PetscBLASIntCast(y->nz,&bnz);CHKERRQ(ierr);
  ierr  = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  cberr = cublasXscal(handle,bnz,&a,yarray,inc);CHKERRCUBLAS(cberr);
  cuerr = WaitForCUDA();CHKERRCUDA(cuerr);
  ierr  = PetscLogGpuFlops(bnz);CHKERRQ(ierr);
  ierr  = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr  = MatSeqAIJCUSPARSERestoreArray(Y,&yarray);CHKERRQ(ierr);
  /* scaling changes the diagonal values cached for e.g. Jacobi */
  ierr  = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3214 
/* Zero all stored entries of A: on the GPU copy (and its cached transpose)
   when the matrix is unfactored and stored in CSR format, and always on the
   host CSR arrays.  The offload mask is set to BOTH when the GPU values were
   zeroed too, otherwise to CPU so the next GPU use re-uploads. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode             ierr;
  PetscBool                  both = PETSC_FALSE;
  Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
    /* spptr->mat->mat holds a CsrMatrix only for MAT_CUSPARSE_CSR; for ELL/HYB it is
       a cusparseHybMat_t (see MatSeqAIJCUSPARSEMultStruct_Destroy), so the casts below
       would be invalid -- guard on the format and fall back to host-only zeroing */
    if (spptr->format == MAT_CUSPARSE_CSR) {
      if (spptr->mat) {
        CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
        if (matrix->values) {
          both = PETSC_TRUE;
          thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
        }
      }
      if (spptr->matTranspose) {
        CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
        if (matrix->values) {
          thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
        }
      }
    }
  }
  /* zero the host CSR value array directly instead of calling MatZeroEntries_SeqAIJ */
  ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;

  PetscFunctionReturn(0);
}
3246 
/* Switch A's operation table between the CPU (SeqAIJ) and GPU (SeqAIJCUSPARSE)
   implementations.  flg = PETSC_TRUE binds to the CPU: the host copy is
   refreshed from the GPU first and the GPU-only composed methods are removed.
   flg = PETSC_FALSE (also used at conversion time) installs the GPU kernels.
   Factored matrices are left untouched. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  if (flg) {
    /* make sure the host copy is current before the CPU kernels take over */
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  /* inodes are only used by the CPU kernels */
  a->inode.use = flg;
  PetscFunctionReturn(0);
}
3297 
/* Convert a SeqAIJ matrix to SEQAIJCUSPARSE (in place, as a fresh duplicate,
   or into an existing reuse matrix).  Allocates the GPU-side bookkeeping
   (cusparse handle, default CSR storage format), installs the CUSPARSE
   operation table via MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE), and changes
   the type name. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  /* for MAT_INPLACE_MATRIX, *newmat is A itself */
  B = *newmat;

  /* vectors this matrix creates should default to the CUDA type */
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      spptr->format = MAT_CUSPARSE_CSR;
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      /* run cusparse work on PETSc's default stream */
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
      spptr->deviceMat = NULL;
    } else {
      /* a factored matrix carries triangular-factor data instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* installs the rest of the GPU operation table and the composed methods */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3347 
/* Type constructor for MATSEQAIJCUSPARSE: build a SeqAIJ matrix, convert it in
   place, then process the CUSPARSE-specific command-line options. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  /* PetscOptionsObject below is declared by the PetscObjectOptionsBegin() macro */
  ierr = PetscObjectOptionsBegin((PetscObject)B);CHKERRQ(ierr);
  ierr = MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3360 
3361 /*MC
3362    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3363 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
3365    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
3366    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3367 
3368    Options Database Keys:
3369 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
3370 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3371 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3372 
3373   Level: beginner
3374 
3375 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3376 M*/
3377 
3378 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
3379 
3380 
/* Register the CUSPARSE solver package as the provider of LU, Cholesky, ILU and
   ICC factorizations for MATSEQAIJCUSPARSE matrices. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode      ierr;
  const MatFactorType factors[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  int                 i;

  PetscFunctionBegin;
  /* all four factor types are served by the same factory routine */
  for (i=0; i<(int)(sizeof(factors)/sizeof(factors[0])); i++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,factors[i],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3392 
/* Free everything hanging off a Mat_SeqAIJCUSPARSE (the GPU matrix and its
   cached transpose, the work vector, and the COO-assembly scratch arrays),
   destroy the cusparse handle, and free the struct itself. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    /* C++ delete of a NULL pointer is a harmless no-op, so no guards needed here */
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    delete (*cusparsestruct)->csr2csc_i;
    if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3412 
/* Free a CsrMatrix: its three device vectors, then the struct itself, and
   NULL the caller's pointer. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    CsrMatrix *m = *mat;
    delete m->values;
    delete m->column_indices;
    delete m->row_offsets;
    delete m;
    *mat = NULL;
  }
  PetscFunctionReturn(0);
}
3425 
/* Free a triangular-factor struct: matrix descriptor, analysis info, the CSR
   data, and the various device/host work buffers, then the struct itself.
   (Despite the name, this overload takes a TriFactorStruct; the MultStruct
   overload follows below.) */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    if ((*trifactor)->solveBuffer)   {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
    /* AA_h is page-locked host memory, hence cudaFreeHost rather than cudaFree */
    if ((*trifactor)->AA_h)   {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
   #endif
    ierr = PetscFree(*trifactor);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3445 
/* Free a Mat_SeqAIJCUSPARSEMultStruct: the stored matrix (CSR, or HYB/ELL for
   CUDA < 11), the matrix descriptor, the compressed-row index vector, the
   device-resident scalar constants, and (CUDA >= 11) the cached SpMV
   descriptors and buffers.  NULLs the caller's pointer. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
       #endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        /* check the error code; this call's return value was previously discarded */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    /* cuSpMV[] is indexed by cusparseOperation_t elsewhere in this file; presumably
       the three slots correspond to the N/T/H operations -- confirm */
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
3489 
/* Release the contents of a Mat_SeqAIJCUSPARSETriFactors -- the four factor
   structs, the permutation index vectors, and the work vector -- without
   destroying the struct or its cusparse handle, so it can be re-populated. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete fs->rpermIndices;
    delete fs->cpermIndices;
    delete fs->workVector;
    fs->rpermIndices = NULL;
    fs->cpermIndices = NULL;
    fs->workVector   = NULL;
  }
  PetscFunctionReturn(0);
}
3509 
/* Destroy a Mat_SeqAIJCUSPARSETriFactors: reset its contents, destroy the
   cusparse handle (when one was created), and free the struct. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    /* explicit assignment: the original tested "if (handle = ...)", which reads
       like a typo for == and triggers compiler -Wparentheses warnings */
    handle = (*trifactors)->handle;
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3526 
/* Strict-weak lexicographic ordering on (i,j) index pairs: compare the first
   components, break ties on the second.  Used to sort COO entries. */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    const PetscInt a = t1.get<0>(), b = t2.get<0>();
    return (a < b) || (a == b && t1.get<1>() < t2.get<1>());
  }
};
3537 
/* Equality of (row, col) index pairs; used to detect duplicate COO entries. */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>();
  }
};
3547 
/* Binary op for adjacent_difference: 1 where consecutive values differ, else 0. */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != t2) ? 1 : 0;
  }
};
3556 
/* Logical-or combiner yielding 0/1; marks a position if either input flag is set. */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 || t2) ? 1 : 0;
  }
};
3565 
3566 #include <thrust/iterator/discard_iterator.h>
/* Set (or add) the COO-ordered values v into the GPU CSR storage of A.
   v is in the user ordering given to MatSetPreallocationCOO_SeqAIJCUSPARSE;
   cusp->cooPerm maps that ordering to sorted CSR order, and cusp->cooPerm_a
   (when non-NULL) is the segment map used to reduce duplicate entries.
   v may be a host or device pointer; a NULL v with INSERT_VALUES zeros A. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY                           *cooPerm_v = NULL; /* device copy of v, allocated only when v is on the host */
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                             *matrix;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;
  PetscInt                              n;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    /* no COO preallocation information: fall back to a plain final assembly */
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) {
    /* NULL values: INSERT zeroes the matrix, ADD is a no-op; skip straight to bookkeeping */
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v); /* already on the device: use in place */
  } else {
    /* host data: stage a device copy first */
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v,v+n);
    d_v = cooPerm_v->data();
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
  }
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) {
      /* duplicates present: reduce v by segment into a scratch array, then add to the CSR values */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* unique entries: gather v through the permutation and += directly */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals());
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else {
      /* unique entries: gather v through the permutation and assign */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    }
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalize:
  delete cooPerm_v;
  /* the authoritative copy now lives on the GPU */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr);
  ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr);
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
3645 
/* Mark the cached transpose of A as out of date; with destroy == PETSC_TRUE
   also free the transpose structure and the csr2csc index map. */
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp) {
    if (destroy) {
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
      delete cusp->csr2csc_i;
      cusp->csr2csc_i = NULL;
    }
    /* in either case, the transpose no longer matches the current values */
    cusp->transupdated = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3662 
3663 #include <thrust/binary_search.h>
/* Preallocate A from n COO (coo_i[], coo_j[]) index pairs, building the CSR
   pattern on the GPU.  Produces cusp->cooPerm (permutation from user COO order
   to sorted CSR order) and, when duplicates exist, cusp->cooPerm_a (segment ids
   used later to reduce repeated entries in MatSetValuesCOO). */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0; /* nzr counts rows with at least one nonzero */
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  /* discard stale permutation data if the number of COO entries changed */
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);            /* device copy of coo_i */
    THRUSTINTARRAY d_j(n);            /* device copy of coo_j */
    THRUSTINTARRAY ii(A->rmap->n);    /* per-row entry counts (upper bounds) */

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* sort (i,j) pairs lexicographically, carrying the identity permutation along */
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare());
    *cusp->cooPerm_a = d_i;  /* keep sorted rows before unique() compacts d_i/d_j */
    THRUSTINTARRAY w = d_j;

    /* compact duplicate (i,j) pairs; nekey marks the new logical end */
    auto nekey = thrust::unique(fkey, ekey, IJEqual());
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* I couldn't come up with a more elegant algorithm */
      /* build segment ids: flag positions where (i,j) changes, then prefix-sum the flags */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff());
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());
      (*cusp->cooPerm_a)[0] = 0;
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum());
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>());
    }
    /* count entries per row: upper_bound over the sorted, uniqued row indices */
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(),
                        search_begin, search_begin + A->rmap->n,
                        ii.begin());
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* rebuild the host-side AIJ structure (a->i, a->j) from the device results */
    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0;
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
  } else {
    /* n == 0: empty preallocation */
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);

  /* values are not set yet: the matrix must be assembled via MatSetValuesCOO */
  A->assembled = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3763 
/* Give read-only access to the device CSR values of A, pushing host data to
   the GPU first if it is stale.  Pair with MatSeqAIJCUSPARSERestoreArrayRead. */
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *mat;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* only the CSR storage format exposes a raw values array */
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  mat = (CsrMatrix*)cusp->mat->mat;
  if (!mat->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = mat->values->data().get();
  PetscFunctionReturn(0);
}
3782 
/* End read-only access obtained with MatSeqAIJCUSPARSEGetArrayRead.
   Read access cannot change the matrix, so no state bump is needed. */
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  *a = NULL; /* invalidate the caller's pointer */
  PetscFunctionReturn(0);
}
3792 
/* Give read-write access to the device CSR values of A.  Host data is copied
   up first; the GPU copy becomes authoritative and the cached transpose is
   marked stale.  Pair with MatSeqAIJCUSPARSERestoreArray. */
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *mat;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* only the CSR storage format exposes a raw values array */
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  mat = (CsrMatrix*)cusp->mat->mat;
  if (!mat->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = mat->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* the caller may write through the pointer */
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3813 
/* End read-write access obtained with MatSeqAIJCUSPARSEGetArray; the values
   may have changed, so the object state is increased. */
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL; /* invalidate the caller's pointer */
  PetscFunctionReturn(0);
}
3826 
/* Give write-only access to the device CSR values of A.  Unlike GetArray,
   no host-to-device copy is performed (the current values are assumed to be
   overwritten); the GPU copy becomes authoritative and the cached transpose
   is marked stale.  Pair with MatSeqAIJCUSPARSERestoreArrayWrite. */
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *mat;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* only the CSR storage format exposes a raw values array */
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  mat = (CsrMatrix*)cusp->mat->mat;
  if (!mat->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = mat->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3846 
/* End write-only access obtained with MatSeqAIJCUSPARSEGetArrayWrite; the
   values were (presumably) rewritten, so the object state is increased. */
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL; /* invalidate the caller's pointer */
  PetscFunctionReturn(0);
}
3859 
/* Lexicographic (row, col) ordering on (i, j, value, perm) 4-tuples; the
   value and permutation slots are carried along but do not affect the order. */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    const int r1 = t1.get<0>(), r2 = t2.get<0>();
    return (r1 < r2) || (r1 == r2 && t1.get<1>() < t2.get<1>());
  }
};
3870 
/* Unary functor adding a fixed offset to an int; used to shift column/row
   indices when concatenating two matrices. */
struct Shift
{
  int _shift; /* constant offset applied to every input */

  Shift(int shift) : _shift(shift) {}
  __host__ __device__
  inline int operator() (const int &c)
  {
    return _shift + c;
  }
};
3882 
3883 /* merges to SeqAIJCUSPARSE matrices, [A';B']' operation in matlab notation */
3884 PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
3885 {
3886   PetscErrorCode               ierr;
3887   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
3888   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
3889   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
3890   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
3891   PetscInt                     Annz,Bnnz;
3892   cusparseStatus_t             stat;
3893   PetscInt                     i,m,n,zero = 0;
3894   cudaError_t                  cerr;
3895 
3896   PetscFunctionBegin;
3897   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
3898   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
3899   PetscValidPointer(C,4);
3900   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
3901   PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
3902   if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n);
3903   if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
3904   if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3905   if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3906   if (reuse == MAT_INITIAL_MATRIX) {
3907     m     = A->rmap->n;
3908     n     = A->cmap->n + B->cmap->n;
3909     ierr  = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
3910     ierr  = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
3911     ierr  = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
3912     c     = (Mat_SeqAIJ*)(*C)->data;
3913     Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
3914     Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
3915     Ccsr  = new CsrMatrix;
3916     Cmat->cprowIndices      = NULL;
3917     c->compressedrow.use    = PETSC_FALSE;
3918     c->compressedrow.nrows  = 0;
3919     c->compressedrow.i      = NULL;
3920     c->compressedrow.rindex = NULL;
3921     Ccusp->workVector       = NULL;
3922     Ccusp->nrows    = m;
3923     Ccusp->mat      = Cmat;
3924     Ccusp->mat->mat = Ccsr;
3925     Ccsr->num_rows  = m;
3926     Ccsr->num_cols  = n;
3927     stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
3928     stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
3929     stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
3930     cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
3931     cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
3932     cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
3933     cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
3934     cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
3935     cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
3936     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
3937     ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
3938     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
3939     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(B);CHKERRQ(ierr);
3940     if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3941     if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3942 
3943     Acsr = (CsrMatrix*)Acusp->mat->mat;
3944     Bcsr = (CsrMatrix*)Bcusp->mat->mat;
3945     Annz = (PetscInt)Acsr->column_indices->size();
3946     Bnnz = (PetscInt)Bcsr->column_indices->size();
3947     c->nz = Annz + Bnnz;
3948     Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
3949     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3950     Ccsr->values = new THRUSTARRAY(c->nz);
3951     Ccsr->num_entries = c->nz;
3952     Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
3953     if (c->nz) {
3954       auto Acoo = new THRUSTINTARRAY32(Annz);
3955       auto Bcoo = new THRUSTINTARRAY32(Bnnz);
3956       auto Ccoo = new THRUSTINTARRAY32(c->nz);
3957       THRUSTINTARRAY32 *Aroff,*Broff;
3958 
3959       if (a->compressedrow.use) { /* need full row offset */
3960         if (!Acusp->rowoffsets_gpu) {
3961           Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
3962           Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
3963           ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
3964         }
3965         Aroff = Acusp->rowoffsets_gpu;
3966       } else Aroff = Acsr->row_offsets;
3967       if (b->compressedrow.use) { /* need full row offset */
3968         if (!Bcusp->rowoffsets_gpu) {
3969           Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
3970           Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
3971           ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
3972         }
3973         Broff = Bcusp->rowoffsets_gpu;
3974       } else Broff = Bcsr->row_offsets;
3975       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3976       stat = cusparseXcsr2coo(Acusp->handle,
3977                               Aroff->data().get(),
3978                               Annz,
3979                               m,
3980                               Acoo->data().get(),
3981                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
3982       stat = cusparseXcsr2coo(Bcusp->handle,
3983                               Broff->data().get(),
3984                               Bnnz,
3985                               m,
3986                               Bcoo->data().get(),
3987                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
3988       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
3989       auto Aperm = thrust::make_constant_iterator(1);
3990       auto Bperm = thrust::make_constant_iterator(0);
3991 #if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
3992       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
3993       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
3994 #else
3995       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
3996       auto Bcib = Bcsr->column_indices->begin();
3997       auto Bcie = Bcsr->column_indices->end();
3998       thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
3999 #endif
4000       auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
4001       auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
4002       auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
4003       auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
4004       auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
4005       auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
4006       auto p1 = Ccusp->cooPerm->begin();
4007       auto p2 = Ccusp->cooPerm->begin();
4008       thrust::advance(p2,Annz);
4009       PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
4010 #if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
4011       thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
4012 #endif
4013       auto cci = thrust::make_counting_iterator(zero);
4014       auto cce = thrust::make_counting_iterator(c->nz);
4015 #if 0 //Errors on SUMMIT cuda 11.1.0
4016       PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
4017 #else
4018       auto pred = thrust::identity<int>();
4019       PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
4020       PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
4021 #endif
4022       stat = cusparseXcoo2csr(Ccusp->handle,
4023                               Ccoo->data().get(),
4024                               c->nz,
4025                               m,
4026                               Ccsr->row_offsets->data().get(),
4027                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4028       cerr = WaitForCUDA();CHKERRCUDA(cerr);
4029       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4030       delete wPerm;
4031       delete Acoo;
4032       delete Bcoo;
4033       delete Ccoo;
4034 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4035       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
4036                                Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
4037                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4038                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4039 #endif
4040       if (Acusp->transgen && Bcusp->transgen) { /* if A and B have the transpose, generate C transpose too */
4041         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4042         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4043         CsrMatrix *CcsrT = new CsrMatrix;
4044         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4045         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4046 
4047         Ccusp->transgen = PETSC_TRUE;
4048         Ccusp->transupdated = PETSC_TRUE;
4049         Ccusp->rowoffsets_gpu = NULL;
4050         CmatT->cprowIndices = NULL;
4051         CmatT->mat = CcsrT;
4052         CcsrT->num_rows = n;
4053         CcsrT->num_cols = m;
4054         CcsrT->num_entries = c->nz;
4055 
4056         CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
4057         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4058         CcsrT->values = new THRUSTARRAY(c->nz);
4059 
4060         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4061         auto rT = CcsrT->row_offsets->begin();
4062         if (AT) {
4063           rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
4064           thrust::advance(rT,-1);
4065         }
4066         if (BT) {
4067           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
4068           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
4069           thrust::copy(titb,tite,rT);
4070         }
4071         auto cT = CcsrT->column_indices->begin();
4072         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
4073         if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
4074         auto vT = CcsrT->values->begin();
4075         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4076         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4077         cerr = WaitForCUDA();CHKERRCUDA(cerr);
4078         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4079 
4080         stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
4081         stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4082         stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
4083         cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
4084         cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
4085         cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
4086         cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4087         cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4088         cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4089 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4090         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
4091                                  CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
4092                                  CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4093                                  CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4094 #endif
4095         Ccusp->matTranspose = CmatT;
4096       }
4097     }
4098 
4099     c->singlemalloc = PETSC_FALSE;
4100     c->free_a       = PETSC_TRUE;
4101     c->free_ij      = PETSC_TRUE;
4102     ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
4103     ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
4104     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
4105       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4106       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4107       ii   = *Ccsr->row_offsets;
4108       jj   = *Ccsr->column_indices;
4109       cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4110       cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4111     } else {
4112       cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4113       cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4114     }
4115     ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
4116     ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
4117     ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
4118     c->maxnz = c->nz;
4119     c->nonzerorowcnt = 0;
4120     c->rmax = 0;
4121     for (i = 0; i < m; i++) {
4122       const PetscInt nn = c->i[i+1] - c->i[i];
4123       c->ilen[i] = c->imax[i] = nn;
4124       c->nonzerorowcnt += (PetscInt)!!nn;
4125       c->rmax = PetscMax(c->rmax,nn);
4126     }
4127     ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
4128     ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
4129     (*C)->nonzerostate++;
4130     ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
4131     ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
4132     Ccusp->nonzerostate = (*C)->nonzerostate;
4133     (*C)->preallocated  = PETSC_TRUE;
4134   } else {
4135     if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",(*C)->rmap->n,B->rmap->n);
4136     c = (Mat_SeqAIJ*)(*C)->data;
4137     if (c->nz) {
4138       Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
4139       if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
4140       if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4141       if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
4142       ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
4143       ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
4144       if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4145       if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4146       Acsr = (CsrMatrix*)Acusp->mat->mat;
4147       Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4148       Ccsr = (CsrMatrix*)Ccusp->mat->mat;
4149       if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
4150       if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
4151       if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
4152       if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
4153       if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
4154       auto pmid = Ccusp->cooPerm->begin();
4155       thrust::advance(pmid,Acsr->num_entries);
4156       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4157       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
4158                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
4159       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
4160                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4161       thrust::for_each(zibait,zieait,VecCUDAEquals());
4162       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
4163                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4164       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
4165                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
4166       thrust::for_each(zibbit,ziebit,VecCUDAEquals());
4167       ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr);
4168       if (Acusp->transgen && Bcusp->transgen && Ccusp->transgen) {
4169         if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4170         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4171         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4172         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4173         CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
4174         auto vT = CcsrT->values->begin();
4175         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4176         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4177         Ccusp->transupdated = PETSC_TRUE;
4178       }
4179       cerr = WaitForCUDA();CHKERRCUDA(cerr);
4180       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4181     }
4182   }
4183   ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
4184   (*C)->assembled     = PETSC_TRUE;
4185   (*C)->was_assembled = PETSC_FALSE;
4186   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
4187   PetscFunctionReturn(0);
4188 }
4189 
/*
  Copies selected entries of the GPU-resident value array of a SeqAIJCUSPARSE matrix into v.

  Input Parameters:
+ A   - the matrix (values are accessed read-only on the device)
. n   - number of entries to copy
- idx - indices (into the aij value array) of the entries to gather, or NULL to copy the first n values contiguously

  Output Parameter:
. v   - destination array; may be either host or device memory (detected with isCudaMem())

  Notes:
  When idx is given, the indices are uploaded to the device and a thrust gather is performed
  entirely on the GPU; if v is host memory the gather goes through a temporary device buffer
  that is then copied back. When idx is NULL a single contiguous cudaMemcpy is used.
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  PetscErrorCode    ierr;
  bool              dmem;
  const PetscScalar *av;
  cudaError_t       cerr;

  PetscFunctionBegin;
  dmem = isCudaMem(v); /* is the destination device memory? */
  ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
  if (n && idx) {
    /* upload the gather indices to the device */
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

    /* if v is host memory, gather into a temporary device buffer w first */
    THRUSTARRAY *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* dv[k] = av[idx[k]] for k = 0..n-1, performed on the device */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (w) { /* destination is on the host: copy the gathered values back */
      cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    delete w;
  } else {
    /* no index set: contiguous copy of the first n values */
    cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  /* a host destination implies a device->host transfer above; log it as GpuToCpu
     (was incorrectly logged as CpuToGpu) */
  if (!dmem) { ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr); }
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4229