xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision b84f494b715b2473938a122f335ae64b02e9d746)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_CXX_COMPLEX_FIX
7 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
8 
9 #include <petscconf.h>
10 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
11 #include <../src/mat/impls/sbaij/seq/sbaij.h>
12 #include <../src/vec/vec/impls/dvecimpl.h>
13 #include <petsc/private/vecimpl.h>
14 #undef VecType
15 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
16 
/* String table consumed by PetscOptionsEnum(): the leading entries mirror the
   MatCUSPARSEStorageFormat enum values in order, followed by the enum type name,
   the option prefix, and a terminating 0 sentinel */
const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
    0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
  } cusparseCsr2CscAlg_t;
  */
  /* Note: SpMM entries appear in PETSc's 0..6 positional order (ALG4 has value 5, CSR_ALG1 has value 4),
     and Csr2Csc gets a fake "INVALID" entry because cuSPARSE has no enum member with value 0 */
  const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
  const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
  const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif
53 
/* --- Forward declarations of file-local implementations registered on the Mat ops table --- */

/* Symbolic/numeric Cholesky-family factorizations */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

/* Symbolic/numeric LU-family factorizations */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

/* Triangular solves (permuted and natural-ordering fast paths) and matrix-vector products */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

/* GPU-side resource teardown helpers.  The two MatSeqAIJCUSPARSEMultStruct_Destroy
   declarations are C++ overloads (this is a .cu translation unit): one destroys a
   triangular-factor struct, the other a mult struct of a given storage format. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

/* Host<->device synchronization of the AIJ matrix data */
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);

/* COO assembly entry points shared with the MPI implementation */
PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
91 
/*
   MatCUSPARSESetStream - Records a CUDA stream in the matrix's GPU data and binds the
   matrix's cuSPARSE handle to it, so subsequent cuSPARSE calls on this matrix are
   issued on that stream.

   Input Parameters:
+  A      - matrix whose spptr holds a Mat_SeqAIJCUSPARSE structure
-  stream - the CUDA stream to use

   Errors with PETSC_ERR_COR if the GPU data structure has not been allocated.
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusp->stream = stream;
  stat = cusparseSetStream(cusp->handle,cusp->stream);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}
103 
/*
   MatCUSPARSESetHandle - Installs an externally created cuSPARSE handle on the matrix,
   destroying any handle the matrix previously owned, and configures the handle to
   expect device pointers for scalar arguments.

   Input Parameters:
+  A      - matrix whose spptr holds a Mat_SeqAIJCUSPARSE structure
-  handle - the cuSPARSE handle to adopt

   Errors with PETSC_ERR_COR if the GPU data structure has not been allocated.
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusp->handle != handle) {
    /* release the old handle (if any) before adopting the new one */
    if (cusp->handle) {stat = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(stat);}
    cusp->handle = handle;
  }
  stat = cusparseSetPointerMode(cusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}
120 
/*
   MatCUSPARSEClearHandle - Forgets (without destroying) the cuSPARSE handle stored in
   the matrix's GPU data.  Used when the handle is owned elsewhere and must not be
   freed by this matrix.  A no-op for matrices that are not MATSEQAIJCUSPARSE or have
   no GPU data allocated.
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          isseqaijcusparse;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&isseqaijcusparse);CHKERRQ(ierr);
  /* drop the reference only; ownership lies with whoever installed the handle */
  if (isseqaijcusparse && cusp && cusp->handle) cusp->handle = 0;
  PetscFunctionReturn(0);
}
133 
/*
   MatFactorGetSolverType_seqaij_cusparse - Callback composed on factor matrices under
   the name "MatFactorGetSolverType_C"; reports that this factorization comes from the
   MATSOLVERCUSPARSE package.

   Input Parameter:
.  A - the factored matrix (unused)

   Output Parameter:
.  type - set to MATSOLVERCUSPARSE
*/
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
140 
141 /*MC
142   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
143   on a single GPU of type seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
144   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
145   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
146   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
147   algorithms are not recommended. This class does NOT support direct solver operations.
148 
149   Level: beginner
150 
151 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
152 M*/
153 
/*
   MatGetFactor_seqaijcusparse_cusparse - Creates an empty MATSEQAIJCUSPARSE factor
   matrix for A and wires up the symbolic-factorization entry points appropriate to
   the requested factor type (LU/ILU/ILUDT or Cholesky/ICC).

   Input Parameters:
+  A     - the matrix to be factored
-  ftype - the requested factorization type

   Output Parameter:
.  B - the new factor matrix (allocation is skipped; the symbolic phase fills it in)
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  (*B)->factortype  = ftype;
  (*B)->useordering = PETSC_TRUE;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU:
  case MAT_FACTOR_ILUDT:
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC:
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  }

  /* the symbolic factorization allocates the actual storage */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
179 
/*
   MatCUSPARSESetFormat_SeqAIJCUSPARSE - Type-specific implementation behind
   MatCUSPARSESetFormat(); stores the requested GPU storage format.  For sequential
   matrices MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL both map to the single format
   field, so the two cases share one assignment.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
  case MAT_CUSPARSE_ALL:
    cusp->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
197 
198 /*@
199    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
200    operation. Only the MatMult operation can use different GPU storage formats
201    for MPIAIJCUSPARSE matrices.
202    Not Collective
203 
204    Input Parameters:
205 +  A - Matrix of type SEQAIJCUSPARSE
206 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
207 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
208 
209    Output Parameter:
210 
211    Level: intermediate
212 
213 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
214 @*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  /* dispatch to the implementation composed on A (if any); silently a no-op for
     matrix types that do not provide "MatCUSPARSESetFormat_C" */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
224 
225 /*@
226    MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose
227 
228    Collective on mat
229 
230    Input Parameters:
231 +  A - Matrix of type SEQAIJCUSPARSE
232 -  transgen - the boolean flag
233 
234    Level: intermediate
235 
236 .seealso: MATSEQAIJCUSPARSE, MatAIJCUSPARSESetGenerateTranspose()
237 @*/
PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
{
  PetscErrorCode ierr;
  PetscBool      isseqaijcusp;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&isseqaijcusp);CHKERRQ(ierr);
  if (isseqaijcusp) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

    if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
    cusp->transgen = transgen;
    /* when turning the flag off, drop any cached transpose so a stale copy cannot be
       used if the flag is switched back on later */
    if (!transgen) {
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}
257 
/*
   MatSetFromOptions_SeqAIJCUSPARSE - Processes the -mat_cusparse_* command-line options
   for a sequential AIJCUSPARSE matrix: explicit-transpose generation, storage format
   for SpMV / all operations, and (CUDA >= 11) the cuSPARSE SpMV, SpMM, and csr2csc
   algorithm choices.  Options apply only to non-factored matrices.
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscBool transgen = cusparsestruct->transgen;

    ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr);}

    /* format for MatMult only */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    /* format for all operations; processed after the MULT option so it wins if both are given */
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    cusparsestruct->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");

    cusparsestruct->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    /* same positional-consistency guard as for SpMV above */
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    cusparsestruct->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
   #endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
301 
/*
   MatILUFactorSymbolic_SeqAIJCUSPARSE - Symbolic ILU: drops any stale GPU factor
   data, delegates the symbolic phase to the host SeqAIJ implementation, and points
   the numeric phase at the CUSPARSE version.
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&trifactors);CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
313 
/*
   MatLUFactorSymbolic_SeqAIJCUSPARSE - Symbolic LU: drops any stale GPU factor
   data, delegates the symbolic phase to the host SeqAIJ implementation, and points
   the numeric phase at the CUSPARSE version.
*/
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&trifactors);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
325 
/*
   MatICCFactorSymbolic_SeqAIJCUSPARSE - Symbolic ICC: drops any stale GPU factor
   data, delegates the symbolic phase to the host SeqAIJ implementation, and points
   the numeric phase at the CUSPARSE Cholesky version.
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&trifactors);CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
337 
/*
   MatCholeskyFactorSymbolic_SeqAIJCUSPARSE - Symbolic Cholesky: drops any stale GPU
   factor data, delegates the symbolic phase to the host SeqAIJ implementation, and
   points the numeric phase at the CUSPARSE version.
*/
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&trifactors);CHKERRQ(ierr);
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
349 
/*
   MatSeqAIJCUSPARSEBuildILULowerTriMatrix - Builds (first call) or refreshes (later
   calls) the GPU copy of the unit-lower-triangular factor L extracted from the host
   ILU factorization stored in A (Mat_SeqAIJ layout with strictly-lower entries of
   row i in aa[ai[i]..ai[i+1])).  On the first call it assembles a full CSR matrix
   for L in pinned host memory (adding explicit 1.0 diagonal entries, since the host
   storage omits them), uploads it, and runs the cuSPARSE triangular-solve analysis.
   On subsequent calls only the numerical values are rebuilt and re-uploaded; the
   sparsity pattern and analysis are reused.  No-op unless the host data is newer
   than the device data (offloadmask UNALLOCATED or CPU).
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;       /* pinned CSR row offsets / column indices for L */
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* (row 0 of L is just the unit diagonal, hence ai[1]-ai[0] is replaced by the +n diagonal count) */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {
        PetscScalar                       *AALo;

        /* pinned host buffers so the thrust uploads below are fast async-capable copies */
        cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the lower triangular matrix */
        cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the lower triangular matrix: row i = strictly-lower entries of host row i
           followed by an explicit unit diagonal entry */
        AiLo[0]  = (PetscInt) 0;
        AiLo[n]  = nzLower;
        AjLo[0]  = (PetscInt) 0;
        AALo[0]  = (MatScalar) 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        rowOffset= 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

          offset      += nz;
          AjLo[offset] = (PetscInt) i;       /* unit diagonal terminates the row */
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;

        /* thrust device arrays; assign() performs the host-to-device upload */
        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* csrsv2-style API needs an explicit work buffer sized by the library */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
                               #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                               #endif
        );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        /* AA_h (the pinned values buffer) is kept for fast value-only updates below;
           the index buffers are no longer needed on the host */
        loTriFactor->AA_h = AALo;
        cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the lower triangular matrix (same layout as above, values only) */
        loTriFactor->AA_h[0]  = 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
          offset      += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset      += 1;
          v  += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
494 
/*
   MatSeqAIJCUSPARSEBuildILUUpperTriMatrix - Builds (first call) or refreshes (later
   calls) the GPU copy of the upper-triangular factor U from the host ILU data.  The
   host factorization stores U rows backwards via a->diag (adiag[i] points at row i's
   diagonal; entries above it sit between adiag[i+1]+1 and adiag[i]), and stores the
   diagonal as its reciprocal, so rows are assembled back-to-front and the diagonal
   entry is re-inverted (1./v[nz]) into a non-unit-diagonal CSR matrix.  First call
   uploads the full CSR structure and runs the cuSPARSE solve analysis; later calls
   rebuild values only.  No-op unless host data is newer than device data.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;      /* pinned CSR row offsets / column indices for U */
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      /* (adiag decreases with i in this storage scheme, so this span is the total U count) */
      nzUpper = adiag[0]-adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        /* pinned host buffers so the thrust uploads below are fast */
        cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix, walking rows from last to first since the
           host stores U back-to-front */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;
          vi = aj + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements (host stores 1/diag, so invert back) */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1./v[nz];
          AiUp[i]      = AiUp[i+1] - (nz+1);

          ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = n;
        upTriFactor->csrMat->num_cols = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        /* thrust device arrays; assign() performs the host-to-device upload */
        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* csrsv2-style API needs an explicit work buffer sized by the library */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                     upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                     upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                     upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                     &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
                               #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                               #endif
        );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
        /* keep the pinned values buffer for fast value-only updates; free index buffers */
        upTriFactor->AA_h = AAUp;
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      } else {
        if (!upTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the upper triangular matrix (values only; pattern unchanged) */
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1./v[nz];
          ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
        ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
638 
/* Builds both ILU triangular factors on the GPU and, when the row/column
   orderings are not the identity, uploads the permutation index arrays that
   the permuted MatSolve path needs.  Also sizes the device work vector and
   records the factor nonzero count. */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           rowis    = aij->row;
  IS                           colis    = aij->icol;
  PetscBool                    rowident,colident;
  PetscInt                     m        = A->rmap->n;

  PetscFunctionBegin;
  if (!factors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  if (!factors->workVector) factors->workVector = new THRUSTARRAY(m);
  factors->nnz = aij->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* upload the row permutation indices once, if the row ordering is nontrivial */
  ierr = ISIdentity(rowis,&rowident);CHKERRQ(ierr);
  if (!rowident && !factors->rpermIndices) {
    const PetscInt *idx;

    ierr = ISGetIndices(rowis,&idx);CHKERRQ(ierr);
    factors->rpermIndices = new THRUSTINTARRAY(m);
    factors->rpermIndices->assign(idx,idx+m);
    ierr = ISRestoreIndices(rowis,&idx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(m*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* upload the column permutation indices once, if the column ordering is nontrivial */
  ierr = ISIdentity(colis,&colident);CHKERRQ(ierr);
  if (!colident && !factors->cpermIndices) {
    const PetscInt *idx;

    ierr = ISGetIndices(colis,&idx);CHKERRQ(ierr);
    factors->cpermIndices = new THRUSTINTARRAY(m);
    factors->cpermIndices->assign(idx,idx+m);
    ierr = ISRestoreIndices(colis,&idx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(m*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
682 
/* Builds the ICC (incomplete Cholesky) triangular factors on the GPU from the
   host factorization.  Only the upper-triangular structure is stored: the
   "lower" factor reuses the same row offsets / column indices and is applied
   with CUSPARSE_OPERATION_TRANSPOSE, with its values scaled separately (AALo).
   Runs only when the device copy is missing or stale; when the sparsity
   pattern already exists on the GPU, only the numerical values are refreshed. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;   /* pinned host staging: row offsets, column indices */
  PetscScalar                       *AAUp;          /* pinned host staging: upper-factor values */
  PetscScalar                       *AALo;          /* pinned host staging: lower-factor values */
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is viewed both as Mat_SeqAIJ (above) and Mat_SeqSBAIJ
     here; presumably the Cholesky factor is stored in SBAIJ layout -- confirm
     against the factorization routine that fills this matrix */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* only (re)build when the device data is missing or older than the host data */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host buffers so the thrust assigns below copy efficiently */
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (!upTriFactor && !loTriFactor) {
        /* first build: create both the sparsity pattern and the values */
        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];  /* v[nz] is the diagonal entry of row i */
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            /* off-diagonals: negated for the upper factor, additionally scaled
               by the diagonal for the lower factor */
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                                #endif
);CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description: the lower factor shares the upper
           triangular storage (FILL_MODE_UPPER) and is applied transposed */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix: same pattern as the upper factor, different values */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                                #endif
);CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      } else {
        /* refresh path: the GPU sparsity pattern already exists, recompute
           only the numerical values and upload them */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
898 
/* Builds the ICC triangular factors on the GPU, sizes the device work vector,
   records the factor nonzero count, and -- when the ordering permutation is
   nontrivial -- uploads both the permutation and its inverse for use by the
   permuted solve path. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           perm     = aij->row;
  PetscBool                    identity;
  PetscInt                     m        = A->rmap->n;

  PetscFunctionBegin;
  if (!factors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!factors->workVector) factors->workVector = new THRUSTARRAY(m);
  /* both factors share the strictly-triangular part; only the diagonal is not doubled */
  factors->nnz = (aij->nz-m)*2 + m;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* upload the permutation and its inverse when the ordering is nontrivial */
  ierr = ISIdentity(perm,&identity);CHKERRQ(ierr);
  if (!identity) {
    IS             invperm;
    const PetscInt *inv,*fwd;

    ierr = ISInvertPermutation(perm,PETSC_DECIDE,&invperm);CHKERRQ(ierr);
    ierr = ISGetIndices(invperm,&inv);CHKERRQ(ierr);
    ierr = ISGetIndices(perm,&fwd);CHKERRQ(ierr);
    factors->rpermIndices = new THRUSTINTARRAY(m);
    factors->rpermIndices->assign(fwd,fwd+m);
    factors->cpermIndices = new THRUSTINTARRAY(m);
    factors->cpermIndices->assign(inv,inv+m);
    ierr = ISRestoreIndices(invperm,&inv);CHKERRQ(ierr);
    ierr = ISDestroy(&invperm);CHKERRQ(ierr);
    ierr = ISRestoreIndices(perm,&fwd);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*m*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
936 
/* Numeric LU factorization for SEQAIJCUSPARSE: delegates the factorization to
   the CPU SeqAIJ kernel, then selects the appropriate GPU solve routines
   (natural-ordering variants when both permutations are the identity) and
   builds the device triangular factors. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *fact = (Mat_SeqAIJ*)B->data;
  IS             rperm = fact->row;
  IS             cperm = fact->col;
  PetscBool      ridentity,cidentity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* make sure the host copy of A is current, then factor on the CPU */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(rperm,&ridentity);CHKERRQ(ierr);
  ierr = ISIdentity(cperm,&cidentity);CHKERRQ(ierr);
  if (ridentity && cidentity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
967 
/* Numeric Cholesky factorization for SEQAIJCUSPARSE: delegates to the CPU
   SeqAIJ kernel, then selects the appropriate GPU solve routines
   (natural-ordering variants when the permutation is the identity) and builds
   the device triangular factors. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *fact = (Mat_SeqAIJ*)B->data;
  IS             perm  = fact->row;
  PetscBool      identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* make sure the host copy of A is current, then factor on the CPU */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(perm,&identity);CHKERRQ(ierr);
  if (identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
997 
/* Builds and analyzes the transposes (CSC forms) of the lower and upper
   triangular factors so that transpose solves can run directly on the GPU.
   The transposed factors are cached in loTriFactorPtrTranspose /
   upTriFactorPtrTranspose of A->spptr.

   Bug fix: both csr2csc conversions were bracketed by two
   PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,...) calls with no matching
   End; the second call of each pair must be PetscLogEventEnd(), otherwise the
   event stays open and the logging nesting is corrupted. */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor;
     the fill mode flips since the transpose of an upper factor is lower and vice versa */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                       loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                       loTriFactor->csrMat->values->data().get(),
                                       loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(),
                                       loTriFactorT->csrMat->values->data().get(),
                                       loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer
                        #else
                          loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
                        #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  /* was PetscLogEventBegin: the event opened above must be closed here */
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                           loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                          #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                upTriFactor->csrMat->values->data().get(),
                                upTriFactor->csrMat->row_offsets->data().get(),
                                upTriFactor->csrMat->column_indices->data().get(),
                                upTriFactorT->csrMat->values->data().get(),
                                upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                CUSPARSE_ACTION_NUMERIC,indexBase,
                                CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer
                        #else
                          upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
                        #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  /* was PetscLogEventBegin: the event opened above must be closed here */
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
  #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
  #endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                           upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                          #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
1199 
/* Lazily build cusparsestruct->matTranspose, the GPU representation of A^T used for
   transpose multiplies: A's CSR data is converted to CSC (which is the CSR of A^T)
   with cusparse_csr2csc.  No-op when transpose generation is disabled, the transpose
   already exists, or the matrix has zero rows or columns. */
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; /* NOTE(review): dead initializer -- overwritten unconditionally below */
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (!cusparsestruct->transgen || cusparsestruct->matTranspose || !A->rmap->n || !A->cmap->n) PetscFunctionReturn(0);
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* create cusparse matrix descriptor for the transpose; it inherits A's index base */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

  /* set alpha and beta: device-resident scalar constants (pointer mode DEVICE below) */
  err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    matrixT->num_rows = A->cmap->n; /* dimensions are swapped for the transpose */
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* copy A's host row offsets (a->i, full non-compressed CSR) to the GPU for the conversion */
    if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
    cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

    /* compute the transpose, i.e. the CSC */
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* CUDA >= 11 requires a caller-allocated workspace, sized by a separate query */
    stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                  A->cmap->n, matrix->num_entries,
                                  matrix->values->data().get(),
                                  cusparsestruct->rowoffsets_gpu->data().get(),
                                  matrix->column_indices->data().get(),
                                  matrixT->values->data().get(),
                                  matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                  CUSPARSE_ACTION_NUMERIC,indexBase,
                                  cusparsestruct->csr2cscAlg, &cusparsestruct->csr2cscBufferSize);CHKERRCUSPARSE(stat);
    err = cudaMalloc(&cusparsestruct->csr2cscBuffer,cusparsestruct->csr2cscBufferSize);CHKERRCUDA(err);
   #endif

    stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            cusparsestruct->rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                            CUSPARSE_ACTION_NUMERIC,indexBase,
                            cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
                          #else
                            /* the pre-CUDA-11 API takes the output (colind,rowptr) in this order */
                            matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase
                          #endif
);CHKERRCUSPARSE(stat);
    matstructT->mat = matrixT;

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* generic-API descriptor for the transpose, needed by cusparseSpMV/SpMM */
    stat = cusparseCreateCsr(&matstructT->matDescr,
                             matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                             matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                             matrixT->values->data().get(),
                             CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                             indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
   #endif
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
   #else
    CsrMatrix *temp  = new CsrMatrix;
    CsrMatrix *tempT = new CsrMatrix;
    /* First convert HYB to CSR */
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUSPARSE(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    /* NOTE(review): tempT keeps A's (num_rows,num_cols) and allocates rmap->n+1 offsets;
       a CSC of a rectangular A would need cmap->n+1 offsets -- verify (legacy pre-CUDA-11 path) */
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSPARSE(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    /* delete temporaries: the HYB matrix owns its own storage now */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
   #endif
  }
  err  = WaitForCUDA();CHKERRCUDA(err);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}
1358 
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/* Solve A^T x = b with explicitly stored transposed triangular factors (built on
   first use).  b is gathered through the row permutation into x, the transposed
   upper then lower factors are solved, and the result is scattered through the
   column permutation (out of place, via the work vector). */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transposed factors ... on the fly, only once */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder b with the row permutation, gathering into x */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve with the transposed upper factor (second-to-last solve arg is input, last is output) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Then, solve with the transposed lower factor, writing back into x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves: roughly 2*nnz - n flops */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1445 
/* Solve A^T x = b when the factorization used natural (identity) ordering: no
   row/column permutations are needed, so b feeds the transposed upper solve
   directly and the transposed lower solve writes straight into x. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transposed factors ... on the fly, only once */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve with the transposed upper factor: b -> work vector */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Then, solve with the transposed lower factor: work vector -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves: roughly 2*nnz - n flops */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1513 
/* Solve A x = b using the lower/upper triangular factors on the GPU: gather b
   through the row permutation into the work vector, solve L then U, and scatter
   the result through the column permutation into x. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder b with the row permutation into the work vector
     (the copy length is determined by the index iterators, so the base bGPU need not be advanced) */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L: work vector -> x (second-to-last solve arg is input, last is output) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Then, solve U: x -> work vector */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Last, reorder the work vector with the column permutation into x */
  thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves: roughly 2*nnz - n flops */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1588 
/* Solve A x = b for a factorization computed with natural (identity) ordering:
   no permutations are applied, so the data flow is b -> (L solve) -> work
   vector -> (U solve) -> x. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L: b -> work vector (second-to-last solve arg is input, last is output) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Next, solve U: work vector -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves: roughly 2*nnz - n flops */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1648 
/* Bring the host copy of the nonzero values up to date with the GPU copy.
   Only the values array is transferred (a->nz scalars); afterwards both
   copies are marked current (PETSC_OFFLOAD_BOTH).  A no-op unless the GPU
   holds the only fresh copy. */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *aij  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  cudaError_t        cerr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  /* nothing to do when the host data is already current */
  if (A->offloadmask != PETSC_OFFLOAD_GPU) PetscFunctionReturn(0);
  csr  = (CsrMatrix*)cusp->mat->mat;
  ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
  cerr = cudaMemcpy(aij->a, csr->values->data().get(), aij->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu(aij->nz*sizeof(PetscScalar));CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_BOTH;
  PetscFunctionReturn(0);
}
1669 
/* Hand out the host pointer to the nonzero values, first making sure the host
   copy is current.  Because the caller receives a writable pointer, the GPU
   copy must be treated as stale afterwards (PETSC_OFFLOAD_CPU). */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* pull the latest values down from the GPU if needed */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  /* the caller may modify the values, so only the CPU copy remains valid */
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array = aij->a;
  PetscFunctionReturn(0);
}
1681 
1682 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1683 {
1684   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1685   Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
1686   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1687   PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
1688   PetscErrorCode               ierr;
1689   cusparseStatus_t             stat;
1690   PetscBool                    both = PETSC_TRUE;
1691   cudaError_t                  err;
1692 
1693   PetscFunctionBegin;
1694   if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Cannot copy to GPU");
1695   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
1696     if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
1697       /* Copy values only */
1698       CsrMatrix *matrix,*matrixT;
1699       matrix = (CsrMatrix*)cusparsestruct->mat->mat;
1700 
1701       if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR values");
1702       ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1703       matrix->values->assign(a->a, a->a+a->nz);
1704       err  = WaitForCUDA();CHKERRCUDA(err);
1705       ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
1706       ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1707 
1708       /* Update matT when it was built before */
1709       if (cusparsestruct->matTranspose) {
1710         cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
1711         matrixT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
1712         ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1713         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1714                             A->cmap->n, matrix->num_entries,
1715                             matrix->values->data().get(),
1716                             cusparsestruct->rowoffsets_gpu->data().get(),
1717                             matrix->column_indices->data().get(),
1718                             matrixT->values->data().get(),
1719                           #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1720                             matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1721                             CUSPARSE_ACTION_NUMERIC,indexBase,
1722                             cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
1723                           #else
1724                             matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1725                             CUSPARSE_ACTION_NUMERIC, indexBase
1726                           #endif
1727 );CHKERRCUSPARSE(stat);
1728         err  = WaitForCUDA();CHKERRCUDA(err);
1729         ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1730       }
1731     } else {
1732       PetscInt nnz;
1733       ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1734       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
1735       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
1736       delete cusparsestruct->workVector;
1737       delete cusparsestruct->rowoffsets_gpu;
1738       try {
1739         if (a->compressedrow.use) {
1740           m    = a->compressedrow.nrows;
1741           ii   = a->compressedrow.i;
1742           ridx = a->compressedrow.rindex;
1743         } else {
1744           m    = A->rmap->n;
1745           ii   = a->i;
1746           ridx = NULL;
1747         }
1748         if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR row data");
1749         if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR column data");
1750         if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
1751         else nnz = a->nz;
1752 
1753         /* create cusparse matrix */
1754         cusparsestruct->nrows = m;
1755         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1756         stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
1757         stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
1758         stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
1759 
1760         err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
1761         err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1762         err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1763         err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1764         err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1765         err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1766         stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
1767 
1768         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1769         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1770           /* set the matrix */
1771           CsrMatrix *mat= new CsrMatrix;
1772           mat->num_rows = m;
1773           mat->num_cols = A->cmap->n;
1774           mat->num_entries = nnz;
1775           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1776           mat->row_offsets->assign(ii, ii + m+1);
1777 
1778           mat->column_indices = new THRUSTINTARRAY32(nnz);
1779           mat->column_indices->assign(a->j, a->j+nnz);
1780 
1781           mat->values = new THRUSTARRAY(nnz);
1782           if (a->a) mat->values->assign(a->a, a->a+nnz);
1783 
1784           /* assign the pointer */
1785           matstruct->mat = mat;
1786          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1787           if (mat->num_rows) { /* cusparse errors on empty matrices! */
1788             stat = cusparseCreateCsr(&matstruct->matDescr,
1789                                     mat->num_rows, mat->num_cols, mat->num_entries,
1790                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
1791                                     mat->values->data().get(),
1792                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
1793                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
1794           }
1795          #endif
1796         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1797          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1798           SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1799          #else
1800           CsrMatrix *mat= new CsrMatrix;
1801           mat->num_rows = m;
1802           mat->num_cols = A->cmap->n;
1803           mat->num_entries = nnz;
1804           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1805           mat->row_offsets->assign(ii, ii + m+1);
1806 
1807           mat->column_indices = new THRUSTINTARRAY32(nnz);
1808           mat->column_indices->assign(a->j, a->j+nnz);
1809 
1810           mat->values = new THRUSTARRAY(nnz);
1811           if (a->a) mat->values->assign(a->a, a->a+nnz);
1812 
1813           cusparseHybMat_t hybMat;
1814           stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1815           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1816             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1817           stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
1818               matstruct->descr, mat->values->data().get(),
1819               mat->row_offsets->data().get(),
1820               mat->column_indices->data().get(),
1821               hybMat, 0, partition);CHKERRCUSPARSE(stat);
1822           /* assign the pointer */
1823           matstruct->mat = hybMat;
1824 
1825           if (mat) {
1826             if (mat->values) delete (THRUSTARRAY*)mat->values;
1827             if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
1828             if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
1829             delete (CsrMatrix*)mat;
1830           }
1831          #endif
1832         }
1833 
1834         /* assign the compressed row indices */
1835         if (a->compressedrow.use) {
1836           cusparsestruct->workVector = new THRUSTARRAY(m);
1837           matstruct->cprowIndices    = new THRUSTINTARRAY(m);
1838           matstruct->cprowIndices->assign(ridx,ridx+m);
1839           tmp = m;
1840         } else {
1841           cusparsestruct->workVector = NULL;
1842           matstruct->cprowIndices    = NULL;
1843           tmp = 0;
1844         }
1845         ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);
1846 
1847         /* assign the pointer */
1848         cusparsestruct->mat = matstruct;
1849       } catch(char *ex) {
1850         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1851       }
1852       err  = WaitForCUDA();CHKERRCUDA(err);
1853       ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1854       cusparsestruct->nonzerostate = A->nonzerostate;
1855     }
1856     if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
1857   }
1858   PetscFunctionReturn(0);
1859 }
1860 
/* Thrust functor for zipped (source,destination) tuples: accumulates the
   first tuple entry into the second, i.e. dest = dest + src. Callable from
   both host and device code. */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple zipped)
  {
    thrust::get<1>(zipped) = thrust::get<1>(zipped) + thrust::get<0>(zipped);
  }
};
1870 
/* Thrust functor for zipped (source,destination) tuples: copies the first
   tuple entry onto the second, i.e. dest = src. Callable from both host and
   device code. */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple zipped)
  {
    thrust::get<1>(zipped) = thrust::get<0>(zipped);
  }
};
1880 
/* Thrust functor for zipped tuples, mirror image of VecCUDAEquals: copies
   the second tuple entry onto the first, i.e. first = second. Callable from
   both host and device code. */
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple zipped)
  {
    thrust::get<0>(zipped) = thrust::get<1>(zipped);
  }
};
1890 
/* Per-product auxiliary data attached to C->product->data for the
   MatProduct implementations below; freed by MatDestroy_MatMatCusparse(). */
struct MatMatCusparse {
  PetscBool             cisdense;  /* caller's C was plain MATSEQDENSE (CPU); numeric phase converts the result back */
  PetscScalar           *Bt;       /* CUDA < 11 only: device buffer holding an explicit B^T, since cusparseXcsrmm cannot transpose B */
  Mat                   X;         /* intermediate dense matrix for MATPRODUCT_RARt/PtAP (holds A*P or A*R^t) */
  PetscBool             reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops;    /* flop count logged by the sparse-sparse numeric phase */
  CsrMatrix             *Bcsr;    /* uncompressed copy of B when B is stored in compressed row format */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr; /* descriptor for Bcsr above */
  PetscBool             initialized;   /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr;  /* dense descriptor for B (SpMM path) */
  cusparseDnMatDescr_t  matCDescr;  /* dense descriptor for C (or X) */
  PetscInt              Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
  size_t                mmBufferSize; /* size of mmBuffer in bytes */
  void                  *mmBuffer;    /* workspace for cusparseSpMM / SpGEMM_compute */
  void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;  /* reused between SpGEMM symbolic and numeric phases */
#endif
};
1910 
/* Destructor callback installed on C->product->destroy: releases all GPU
   resources (device buffers, cusparse descriptors) and the intermediate
   matrix owned by a MatMatCusparse, then frees the struct itself. */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse   *mm = (MatMatCusparse*)data;
  PetscErrorCode   ierr;
  cudaError_t      cerr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t stat;
 #endif

  PetscFunctionBegin;
  /* cudaFree(NULL) and delete NULL are no-ops, so Bt/Bcsr need no guards */
  cerr = cudaFree(mm->Bt);CHKERRCUDA(cerr);
  delete mm->Bcsr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (mm->matSpBDescr) {
    stat = cusparseDestroySpMat(mm->matSpBDescr);CHKERRCUSPARSE(stat);
  }
  if (mm->mmBuffer) {
    cerr = cudaFree(mm->mmBuffer);CHKERRCUDA(cerr);
  }
  if (mm->mmBuffer2) {
    cerr = cudaFree(mm->mmBuffer2);CHKERRCUDA(cerr);
  }
  if (mm->matBDescr) {
    stat = cusparseDestroyDnMat(mm->matBDescr);CHKERRCUSPARSE(stat);
  }
  if (mm->matCDescr) {
    stat = cusparseDestroyDnMat(mm->matCDescr);CHKERRCUSPARSE(stat);
  }
  if (mm->spgemmDesc) {
    stat = cusparseSpGEMM_destroyDescr(mm->spgemmDesc);CHKERRCUSPARSE(stat);
  }
 #endif
  ierr = MatDestroy(&mm->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1935 
1936 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1937 
/*
   Numeric phase of a sparse(A, CUSPARSE CSR) x dense(B) product.
   Supports AB, AtB, ABt directly; RARt and PtAP are computed by first forming
   the sparse-dense product into the intermediate mmdata->X and then calling the
   dense-dense kernel. On CUDA >= 11 it uses cusparseSpMM with cached
   dense/sparse descriptors; on older CUDA it uses cusparseXcsrmm, explicitly
   transposing B via cublasXgeam when op(B) = B^T is needed.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* select op(A), the mult struct to use, and the m x n size of op(A)*op(B) */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    /* prefer the explicitly generated transpose when transgen is set;
       otherwise let cusparse apply op = transpose on A */
    if (!cusp->transgen) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);

  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* for RARt/PtAP the sparse-dense product goes into the intermediate X,
     which is combined with B by the dense-dense kernels at the end */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    /* a changed LDA invalidates the cached dense descriptor; recreate it */
    if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Clda = clda;
    }

    /* lazily create the generic CSR descriptor for op(A)'s mult struct */
    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
    /* only grow the workspace, never shrink it */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
      cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
 #else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* form B^T explicitly into mmdata->Bt (allocated in the symbolic phase) via geam */
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k    = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
 #endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  /* RARt: C = B*X = R*(A*R^t);  PtAP: C = B^t*X = P^t*(A*P) */
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  /* convert the result (and B) back to the CPU types the caller passed in */
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2111 
/*
   Symbolic phase of a sparse(A, CUSPARSE CSR) x dense(B) product: sets the
   sizes/type of C, allocates the MatMatCusparse product data (B^T scratch for
   CUDA < 11, intermediate X for RARt/PtAP) and installs the numeric callback.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  /* result sizes: AB -> m_A x n_B, AtB -> n_A x n_B, ABt -> m_A x m_B,
     PtAP -> n_B x n_B, RARt -> m_B x m_B */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense;
 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
 #endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2185 
/*
   Numeric phase of a sparse(A) x sparse(B) product, all in CUSPARSE CSR
   format. The sparsity pattern of C was fixed in the symbolic phase, so here
   only the values are (re)computed: cusparseSpGEMM_compute/copy on CUDA >= 11,
   cusparse_csr_spgemm otherwise. If the symbolic phase already computed the
   values (reusesym), the computation is skipped and only the bookkeeping in
   "finalize" is done.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t         BmatSpDescr;
#endif

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;
  A = product->A;
  B = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix*)Cmat->mat;
    if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
    goto finalize;
  }
  /* empty product: nothing to compute, just finalize the assembly bookkeeping */
  if (!c->nz) goto finalize;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);

  /* symmetric matrices let us avoid the explicit transpose:
     A^t B == A B when A is symmetric, A B^t == A B when B is symmetric */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
  if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix*)Cmat->mat;
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
  if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  /* spgemmDesc and mmBuffer were set up in the symbolic phase; compute the
     values, then copy them into C's descriptor */
  stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
  ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
  c->reallocs         = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}
2300 
2301 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2302 {
2303   Mat_Product                  *product = C->product;
2304   Mat                          A,B;
2305   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2306   Mat_SeqAIJ                   *a,*b,*c;
2307   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2308   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2309   PetscInt                     i,j,m,n,k;
2310   PetscBool                    flg;
2311   PetscErrorCode               ierr;
2312   cusparseStatus_t             stat;
2313   cudaError_t                  cerr;
2314   MatProductType               ptype;
2315   MatMatCusparse               *mmdata;
2316   PetscLogDouble               flops;
2317   PetscBool                    biscompressed,ciscompressed;
2318 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2319   int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
2320   size_t                       bufSize2;
2321   cusparseSpMatDescr_t         BmatSpDescr;
2322 #else
2323   int                          cnz;
2324 #endif
2325 
2326   PetscFunctionBegin;
2327   MatCheckProduct(C,1);
2328   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
2329   A    = product->A;
2330   B    = product->B;
2331   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2332   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
2333   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2334   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
2335   a = (Mat_SeqAIJ*)A->data;
2336   b = (Mat_SeqAIJ*)B->data;
2337   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2338   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2339   if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2340   if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2341 
2342   /* product data */
2343   ierr = PetscNew(&mmdata);CHKERRQ(ierr);
2344   C->product->data    = mmdata;
2345   C->product->destroy = MatDestroy_MatMatCusparse;
2346 
2347   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2348   ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
2349   ptype = product->type;
2350   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
2351   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
2352   biscompressed = PETSC_FALSE;
2353   ciscompressed = PETSC_FALSE;
2354   switch (ptype) {
2355   case MATPRODUCT_AB:
2356     m = A->rmap->n;
2357     n = B->cmap->n;
2358     k = A->cmap->n;
2359     Amat = Acusp->mat;
2360     Bmat = Bcusp->mat;
2361     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2362     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2363     break;
2364   case MATPRODUCT_AtB:
2365     m = A->cmap->n;
2366     n = B->cmap->n;
2367     k = A->rmap->n;
2368     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
2369     Amat = Acusp->matTranspose;
2370     Bmat = Bcusp->mat;
2371     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2372     break;
2373   case MATPRODUCT_ABt:
2374     m = A->rmap->n;
2375     n = B->rmap->n;
2376     k = A->cmap->n;
2377     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(B);CHKERRQ(ierr);
2378     Amat = Acusp->mat;
2379     Bmat = Bcusp->matTranspose;
2380     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2381     break;
2382   default:
2383     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2384   }
2385 
2386   /* create cusparse matrix */
2387   ierr  = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
2388   ierr  = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
2389   c     = (Mat_SeqAIJ*)C->data;
2390   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2391   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
2392   Ccsr  = new CsrMatrix;
2393 
2394   c->compressedrow.use = ciscompressed;
2395   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
2396     c->compressedrow.nrows = a->compressedrow.nrows;
2397     ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
2398     ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
2399     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
2400     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2401     Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
2402   } else {
2403     c->compressedrow.nrows  = 0;
2404     c->compressedrow.i      = NULL;
2405     c->compressedrow.rindex = NULL;
2406     Ccusp->workVector       = NULL;
2407     Cmat->cprowIndices      = NULL;
2408   }
2409   Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
2410   Ccusp->mat      = Cmat;
2411   Ccusp->mat->mat = Ccsr;
2412   Ccsr->num_rows    = Ccusp->nrows;
2413   Ccsr->num_cols    = n;
2414   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
2415   stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
2416   stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
2417   stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
2418   cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
2419   cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
2420   cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
2421   cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2422   cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2423   cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2424   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
2425     thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
2426     c->nz = 0;
2427     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2428     Ccsr->values = new THRUSTARRAY(c->nz);
2429     goto finalizesym;
2430   }
2431 
2432   if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2433   if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2434   Acsr = (CsrMatrix*)Amat->mat;
2435   if (!biscompressed) {
2436     Bcsr = (CsrMatrix*)Bmat->mat;
2437 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2438     BmatSpDescr = Bmat->matDescr;
2439 #endif
2440   } else { /* we need to use row offsets for the full matrix */
2441     CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
2442     Bcsr = new CsrMatrix;
2443     Bcsr->num_rows       = B->rmap->n;
2444     Bcsr->num_cols       = cBcsr->num_cols;
2445     Bcsr->num_entries    = cBcsr->num_entries;
2446     Bcsr->column_indices = cBcsr->column_indices;
2447     Bcsr->values         = cBcsr->values;
2448     if (!Bcusp->rowoffsets_gpu) {
2449       Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
2450       Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
2451       ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
2452     }
2453     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2454     mmdata->Bcsr = Bcsr;
2455 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2456     if (Bcsr->num_rows && Bcsr->num_cols) {
2457       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
2458                                Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2459                                Bcsr->values->data().get(),
2460                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2461                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2462     }
2463     BmatSpDescr = mmdata->matSpBDescr;
2464 #endif
2465   }
2466   if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
2467   if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
2468   /* precompute flops count */
2469   if (ptype == MATPRODUCT_AB) {
2470     for (i=0, flops = 0; i<A->rmap->n; i++) {
2471       const PetscInt st = a->i[i];
2472       const PetscInt en = a->i[i+1];
2473       for (j=st; j<en; j++) {
2474         const PetscInt brow = a->j[j];
2475         flops += 2.*(b->i[brow+1] - b->i[brow]);
2476       }
2477     }
2478   } else if (ptype == MATPRODUCT_AtB) {
2479     for (i=0, flops = 0; i<A->rmap->n; i++) {
2480       const PetscInt anzi = a->i[i+1] - a->i[i];
2481       const PetscInt bnzi = b->i[i+1] - b->i[i];
2482       flops += (2.*anzi)*bnzi;
2483     }
2484   } else { /* TODO */
2485     flops = 0.;
2486   }
2487 
2488   mmdata->flops = flops;
2489   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2490 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2491   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2492   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
2493                            NULL, NULL, NULL,
2494                            CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2495                            CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2496   stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2497   /* ask bufferSize bytes for external memory */
2498   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2499                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2500                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2501                                        mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
2502   cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
2503   /* inspect the matrices A and B to understand the memory requirement for the next step */
2504   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2505                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2506                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2507                                        mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
2508   /* ask bufferSize again bytes for external memory */
2509   stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2510                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2511                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2512                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
2513   /* The CUSPARSE documentation is not clear, nor the API
2514      We need both buffers to perform the operations properly!
2515      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
2516      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
2517      is stored in the descriptor! What a messy API... */
2518   cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
2519   /* compute the intermediate product of A * B */
2520   stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2521                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2522                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2523                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2524   /* get matrix C non-zero entries C_nnz1 */
2525   stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
2526   c->nz = (PetscInt) C_nnz1;
2527   ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr);
2528   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2529   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2530   Ccsr->values = new THRUSTARRAY(c->nz);
2531   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2532   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
2533                                 Ccsr->values->data().get());CHKERRCUSPARSE(stat);
2534   stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2535                              Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2536                              cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2537 #else
2538   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
2539   stat = cusparseXcsrgemmNnz(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2540                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2541                              Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2542                              Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2543                              Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
2544   c->nz = cnz;
2545   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2546   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2547   Ccsr->values = new THRUSTARRAY(c->nz);
2548   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2549 
2550   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2551   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
2552      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
2553      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
2554   stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2555                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2556                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2557                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2558                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2559 #endif
2560   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2561   ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
2562   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2563 finalizesym:
2564   c->singlemalloc = PETSC_FALSE;
2565   c->free_a       = PETSC_TRUE;
2566   c->free_ij      = PETSC_TRUE;
2567   ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
2568   ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
2569   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
2570     PetscInt *d_i = c->i;
2571     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
2572     THRUSTINTARRAY jj(Ccsr->column_indices->size());
2573     ii   = *Ccsr->row_offsets;
2574     jj   = *Ccsr->column_indices;
2575     if (ciscompressed) d_i = c->compressedrow.i;
2576     cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2577     cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2578   } else {
2579     PetscInt *d_i = c->i;
2580     if (ciscompressed) d_i = c->compressedrow.i;
2581     cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2582     cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2583   }
2584   if (ciscompressed) { /* need to expand host row offsets */
2585     PetscInt r = 0;
2586     c->i[0] = 0;
2587     for (k = 0; k < c->compressedrow.nrows; k++) {
2588       const PetscInt next = c->compressedrow.rindex[k];
2589       const PetscInt old = c->compressedrow.i[k];
2590       for (; r < next; r++) c->i[r+1] = old;
2591     }
2592     for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
2593   }
2594   ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
2595   ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
2596   ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
2597   c->maxnz = c->nz;
2598   c->nonzerorowcnt = 0;
2599   c->rmax = 0;
2600   for (k = 0; k < m; k++) {
2601     const PetscInt nn = c->i[k+1] - c->i[k];
2602     c->ilen[k] = c->imax[k] = nn;
2603     c->nonzerorowcnt += (PetscInt)!!nn;
2604     c->rmax = PetscMax(c->rmax,nn);
2605   }
2606   ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
2607   ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
2608   Ccsr->num_entries = c->nz;
2609 
2610   C->nonzerostate++;
2611   ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
2612   ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
2613   Ccusp->nonzerostate = C->nonzerostate;
2614   C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
2615   C->preallocated  = PETSC_TRUE;
2616   C->assembled     = PETSC_FALSE;
2617   C->was_assembled = PETSC_FALSE;
2618   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
2619     mmdata->reusesym = PETSC_TRUE;
2620     C->offloadmask   = PETSC_OFFLOAD_GPU;
2621   }
2622   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2623   PetscFunctionReturn(0);
2624 }
2625 
2626 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2627 
/* Selects the product-symbolic implementation for a SEQAIJCUSPARSE A; handles sparse or dense B.
   GPU paths are chosen only when the operands are not bound to the CPU; otherwise we fall back
   to the host SeqAIJ (or SeqAIJ_SeqDense) implementations. */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
  PetscErrorCode ierr;
  MatProductType ptype;
  PetscBool      Bisdense = PETSC_FALSE,Biscusparse = PETSC_FALSE,Ciscusparse = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  ptype = product->type;
  /* Classify B: dense, or a CUSPARSE matrix usable on the GPU */
  ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&Bisdense);CHKERRQ(ierr);
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusparse);CHKERRQ(ierr);
  }
  /* A triple product additionally needs C to be a GPU-capable CUSPARSE matrix */
  if (ptype == MATPRODUCT_ABC) {
    Ciscusparse = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusparse);CHKERRQ(ierr);
    }
  }
  if (Bisdense) {
    if (ptype == MATPRODUCT_AB || ptype == MATPRODUCT_AtB || ptype == MATPRODUCT_ABt || ptype == MATPRODUCT_PtAP || ptype == MATPRODUCT_RARt) {
      if (product->A->boundtocpu) {
        ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
    } else if (ptype == MATPRODUCT_ABC) {
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
    }
    /* other product types: leave productsymbolic unset */
  } else if (Biscusparse && Ciscusparse) {
    if (ptype == MATPRODUCT_AB || ptype == MATPRODUCT_AtB || ptype == MATPRODUCT_ABt) {
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
    } else if (ptype == MATPRODUCT_PtAP || ptype == MATPRODUCT_RARt || ptype == MATPRODUCT_ABC) {
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
    }
    /* other product types: leave productsymbolic unset */
  } else { /* fallback for plain AIJ */
    ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2686 
/* yy = A*xx (no transpose, nothing added): thin wrapper over the shared SpMV kernel */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* NULL in the add-vector slot means no vector is accumulated; trans=FALSE, herm=FALSE */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2695 
/* zz = A*xx + yy: thin wrapper over the shared SpMV kernel */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* trans=FALSE, herm=FALSE: plain product with yy accumulated into zz */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2704 
/* yy = A^H * xx (conjugate transpose, nothing added): thin wrapper over the shared SpMV kernel */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* trans=TRUE, herm=TRUE selects CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE in the kernel */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2713 
/* zz = A^H * xx + yy (conjugate transpose with addition): thin wrapper over the shared SpMV kernel */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* trans=TRUE, herm=TRUE selects CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE in the kernel */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2722 
/* yy = A^T * xx (transpose, nothing added): thin wrapper over the shared SpMV kernel */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* trans=TRUE, herm=FALSE: the kernel either uses an explicit transpose matrix or the transpose operation */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2731 
/* zz = op(A) xx (+ yy), the workhorse behind all the MatMult* wrappers above.
   op is selected by the flags: if !trans, op = no-op; if trans & !herm, op = ^T; if trans & herm, op = ^H.
   yy may be NULL (nothing is added) and may alias zz (in-place accumulation).
   Special handling exists for "compressed row" matrices (rows that are entirely zero are dropped):
   there the raw SpMV works on a shorter vector, so a scatter/gather through workVector is needed. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny; /* lengths of x and y in y = op(A) x, taken from the CSR struct */
#endif

  PetscFunctionBegin;
  /* Hermitian without transpose (i.e. conj(A) x) has no cuSPARSE operation */
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
  if (!a->nonzerorowcnt) { /* empty matrix: op(A) x == 0, so the result is just yy (or zero) */
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !cusparsestruct->transgen) {
      /* either ^H was requested, or we are not generating an explicit transpose: let cuSPARSE do op(A) */
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      /* use (and lazily build) an explicitly stored transpose, multiplied with NON_TRANSPOSE */
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get up-to-date zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with fewer rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
     #endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter the entries of x at the nonzero-row indices into the (shorter) work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* for the transpose operation the roles of num_rows/num_cols swap */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
     #endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* opA indexes the per-operation cache of dense-vector descriptors and SpMV buffers below */
      if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                matstruct->matDescr,
                                matstruct->cuSpMV[opA].vecXDescr, beta,
                                matstruct->cuSpMV[opA].vecYDescr,
                                cusparse_scalartype,
                                cusparsestruct->spmvAlg,
                                &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
      }

      stat = cusparseSpMV(cusparsestruct->handle, opA,
                               matstruct->alpha_one,
                               matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */
                               matstruct->cuSpMV[opA].vecXDescr,
                               beta,
                               matstruct->cuSpMV[opA].vecYDescr,
                               cusparse_scalartype,
                               cusparsestruct->spmvAlg,
                               matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
     #else
      /* pre-11.0 legacy API takes raw CSR arrays directly */
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha_one, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
     #endif
    } else {
      if (cusparsestruct->nrows) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha_one, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
       #endif
      }
    }
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      /* transpose path: the SpMV already accumulated into zz via beta when yy == zz */
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  /* flop counts: 2 flops per stored nonzero; without an addition, one add per nonzero row is saved */
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2915 
/* zz = A^T * xx + yy (transpose with addition): thin wrapper over the shared SpMV kernel */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* trans=TRUE, herm=FALSE: transpose product with yy accumulated into zz */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2924 
/* Finish assembly: run the host-side SeqAIJ assembly end, then, for a final
   assembly of a matrix not bound to the CPU that has a device (split-CSR)
   matrix, mark the up-to-date data as living on the GPU. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode             ierr;
  PetscSplitCSRDataStructure *devmat = NULL;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) devmat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); /* this does very little if assembled on GPU - call it? */
  if (mode != MAT_FLUSH_ASSEMBLY && !A->boundtocpu && devmat) A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
2941 
2942 /* --------------------------------------------------------------------------------*/
2943 /*@
2944    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
2946    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
2947    assembly performance the user should preallocate the matrix storage by setting
2948    the parameter nz (or the array nnz).  By setting these parameters accurately,
2949    performance during matrix assembly can be increased by more than a factor of 50.
2950 
2951    Collective
2952 
2953    Input Parameters:
2954 +  comm - MPI communicator, set to PETSC_COMM_SELF
2955 .  m - number of rows
2956 .  n - number of columns
2957 .  nz - number of nonzeros per row (same for all rows)
2958 -  nnz - array containing the number of nonzeros in the various rows
2959          (possibly different for each row) or NULL
2960 
2961    Output Parameter:
2962 .  A - the matrix
2963 
2964    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
2966    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
2967 
2968    Notes:
2969    If nnz is given then nz is ignored
2970 
2971    The AIJ format (also called the Yale sparse matrix format or
2972    compressed row storage), is fully compatible with standard Fortran 77
2973    storage.  That is, the stored row and column indices can begin at
2974    either one (as in Fortran) or zero.  See the users' manual for details.
2975 
2976    Specify the preallocated storage with either nz or nnz (not both).
2977    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
2978    allocation.  For large problems you MUST preallocate memory or you
2979    will get TERRIBLE performance, see the users' manual chapter on matrices.
2980 
2981    By default, this format uses inodes (identical nodes) when possible, to
2982    improve numerical efficiency of matrix-vector products and solves. We
2983    search for consecutive rows with the same nonzero structure, thereby
2984    reusing matrix information to achieve increased efficiency.
2985 
2986    Level: intermediate
2987 
2988 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
2989 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode status;

  PetscFunctionBegin;
  /* standard construction sequence: create -> size -> type -> preallocate */
  status = MatCreate(comm,A);CHKERRQ(status);
  status = MatSetSizes(*A,m,n,m,n);CHKERRQ(status);
  status = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(status);
  status = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(status);
  PetscFunctionReturn(0);
}
3001 
/* Destroy the CUSPARSE-specific pieces of A (the spptr structures and, when
   present, the split-CSR device matrix), remove composed methods, and finish
   with the plain SeqAIJ destroy. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode              ierr;
  PetscSplitCSRDataStructure  *d_mat = NULL;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    /* capture the device-matrix pointer BEFORE freeing the spptr that holds it */
    d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
    ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL;
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    /* factored matrices keep triangular-factor data in spptr instead */
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  if (d_mat) {
    Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;
    cudaError_t                err;
    PetscSplitCSRDataStructure h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* bring the struct back to the host so the device arrays it points to can be freed */
    err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    if (a->compressedrow.use) {
      /* NOTE(review): diag.i is freed only in the compressed-row case — presumably it is
         allocated separately only then; confirm against the allocation site */
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  /* drop every composed method so stale function pointers cannot be reached later */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3037 
3038 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
3039 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicate by building a host SeqAIJ copy, then converting it in place to CUSPARSE. */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode status;

  PetscFunctionBegin;
  status = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(status);
  status = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(status);
  PetscFunctionReturn(0);
}
3049 
/* Y = a*X + Y on the GPU.  Three paths:
   - SAME_NONZERO_PATTERN (possibly detected at runtime): one flat cublas axpy
     over the nonzero value arrays;
   - SUBSET_NONZERO_PATTERN: cusparse csrgeam writing into Y's structure;
   - otherwise: fall back to the CPU implementation. */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry,*csrx;
  cudaError_t        cerr;

  PetscFunctionBegin;
  /* if X is not a CUSPARSE matrix too (e.g. bound to CPU), use the host path */
  if (X->ops->axpy != Y->ops->axpy) {
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
  cy   = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  cx   = (Mat_SeqAIJCUSPARSE*)X->spptr;
  if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
  if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;
  /* see if we can turn this into a cublas axpy: equal nz counts plus identical
     row offsets and column indices mean the patterns actually coincide */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
    if (eq) {
      eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
    }
    if (eq) str = SAME_NONZERO_PATTERN;
  }

  if (str == SUBSET_NONZERO_PATTERN) {
    /* csrgeam computes a*X + 1*Y into Y's existing CSR structure */
    cusparseStatus_t stat;
    PetscScalar      b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t           bufferSize;
    void             *buffer;
#endif

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    /* alpha/beta are passed by host pointer here; restore device mode afterwards */
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* CUDA >= 11 requires an explicit workspace buffer sized by a query call */
    stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
                                          &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                          &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                             cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    cerr = cudaFree(buffer);CHKERRCUDA(cerr);
#else
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
#endif
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical sparsity: the value arrays line up, so a dense axpy over nz entries suffices */
    cublasHandle_t cublasv2handle;
    cublasStatus_t berr;
    PetscBLASInt   one = 1, bnz = 1;

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else {
    /* patterns differ in a way the GPU paths cannot handle: do it on the CPU */
    ierr = MatAXPY_SeqAIJ(Y,a,X,DIFFERENT_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3146 
/* Y = a*Y, implemented as one flat cublas scal over the nz device values. */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)Y->data;
  PetscScalar    *vals;
  cudaError_t    cerr;
  cublasHandle_t handle;
  cublasStatus_t berr;
  PetscBLASInt   one = 1,bnz = 1;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEGetArray(Y,&vals);CHKERRQ(ierr);
  ierr = PetscCUBLASGetHandle(&handle);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(aij->nz,&bnz);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  berr = cublasXscal(handle,bnz,&a,vals,one);CHKERRCUBLAS(berr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuFlops(bnz);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSERestoreArray(Y,&vals);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3170 
/* Zero the matrix values on the device (matrix and its cached transpose, when
   allocated) and on the host, then set the offload mask accordingly. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;
  PetscBool      ongpu = PETSC_FALSE;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
    if (spptr->mat) {
      CsrMatrix *csr = (CsrMatrix*)spptr->mat->mat;
      if (csr->values) {
        ongpu = PETSC_TRUE; /* device values exist, so after zeroing both copies agree */
        thrust::fill(thrust::device,csr->values->begin(),csr->values->end(),0.);
      }
    }
    if (spptr->matTranspose) {
      CsrMatrix *csrT = (CsrMatrix*)spptr->matTranspose->mat;
      if (csrT->values) thrust::fill(thrust::device,csrT->values->begin(),csrT->values->end(),0.);
    }
  }
  /* zero the host copy too (shortened form of MatZeroEntries_SeqAIJ) */
  ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  A->offloadmask = ongpu ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
3202 
/* Switch A's operation table (and composed methods) between the host SeqAIJ
   implementations (flg = PETSC_TRUE) and the CUSPARSE implementations
   (flg = PETSC_FALSE).  Factored matrices are left untouched. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  if (flg) {
    /* binding to CPU: make sure the host copy is current before routing ops there */
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* GPU-only composed methods are removed (set to NULL) on the CPU side */
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  } else {
    /* binding to GPU: install the CUSPARSE implementations and composed methods */
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  /* inodes are a host-side optimization; only used when bound to the CPU */
  a->inode.use = flg;
  PetscFunctionReturn(0);
}
3253 
/* Convert a SeqAIJ matrix to SEQAIJCUSPARSE.  Handles MAT_INITIAL_MATRIX
   (duplicate first), MAT_REUSE_MATRIX (copy into *newmat), and
   MAT_INPLACE_MATRIX (mutate A itself); then installs the CUSPARSE spptr,
   op table, and type name on the result. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  /* for MAT_INPLACE_MATRIX, *newmat is A itself */
  B = *newmat;

  /* vectors created to match this matrix should live on the GPU */
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      /* regular matrix: allocate the CUSPARSE mult structure holder */
      Mat_SeqAIJCUSPARSE *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      spptr->format = MAT_CUSPARSE_CSR;
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
      spptr->deviceMat = NULL;
    } else {
      /* factored matrix: allocate the triangular-factors holder instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* PETSC_FALSE installs the GPU op table and composed methods */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3301 
/* Type constructor for MATSEQAIJCUSPARSE: build a SeqAIJ matrix, convert it
   in place, then process matrix options from the options database. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  /* PetscOptionsObject is declared by the PetscObjectOptionsBegin() macro */
  ierr = PetscObjectOptionsBegin((PetscObject)B);CHKERRQ(ierr);
  ierr = MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3314 
3315 /*MC
3316    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3317 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
3319    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
3320    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3321 
3322    Options Database Keys:
3323 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
3324 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3325 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3326 
3327   Level: beginner
3328 
3329 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3330 M*/
3331 
3332 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
3333 
3334 
/* Register the CUSPARSE solver package for the four factorization kinds it
   supports on SEQAIJCUSPARSE matrices. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode      ierr;
  const MatFactorType ftypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  size_t              i;

  PetscFunctionBegin;
  for (i=0; i<sizeof(ftypes)/sizeof(ftypes[0]); i++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftypes[i],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3346 
/* Free everything owned by a Mat_SeqAIJCUSPARSE struct: both mult structures,
   the thrust work/offset/permutation vectors, the cusparse handle, and (CUDA
   >= 11) the csr2csc conversion buffer; finally the struct itself. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    /* thrust containers: delete on a NULL pointer is a no-op, so no guards needed */
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* cudaFree(NULL) is legal, so the buffer needs no guard either */
    cudaError_t cerr = cudaFree((*cusparsestruct)->csr2cscBuffer);CHKERRCUDA(cerr);
   #endif
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3368 
/* Delete the three device arrays of a CsrMatrix plus the container, and
   NULL the caller's pointer.  Safe to call with *mat == NULL. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (!*mat) PetscFunctionReturn(0);
  delete (*mat)->values;
  delete (*mat)->column_indices;
  delete (*mat)->row_offsets;
  delete *mat;
  *mat = 0;
  PetscFunctionReturn(0);
}
3381 
/* Free one triangular-factor structure: its matrix descriptor, analysis info,
   CSR data, solve/conversion buffers, and pinned host copy. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    if ((*trifactor)->solveBuffer)   {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
    /* AA_h is host-pinned memory, hence cudaFreeHost rather than cudaFree */
    if ((*trifactor)->AA_h)   {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
   #endif
    ierr = PetscFree(*trifactor);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3401 
/* Free one mult structure: the stored matrix (HYB or CSR depending on the
   format), its descriptor, compressed-row indices, device scalar constants,
   and (CUDA >= 11) the SpMV descriptors/buffers. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
       #endif
      } else {
        mat  = (CsrMatrix*)(*matstruct)->mat;
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr); /* previously the returned error code was silently dropped */
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    /* alpha_one/beta_zero/beta_one are device-resident scalars (cusparse runs in device pointer mode) */
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    /* one cached SpMV setup per operation (mult/transpose/hermitian-transpose) */
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
3445 
/* Release the contents of a trifactors struct (four factor structures,
   permutation index vectors, work vector) without freeing the struct or
   its cusparse handle, so it can be refilled by a new factorization. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    /* pair each delete with its reset so no dangling pointer survives */
    delete (*trifactors)->rpermIndices; (*trifactors)->rpermIndices = NULL;
    delete (*trifactors)->cpermIndices; (*trifactors)->cpermIndices = NULL;
    delete (*trifactors)->workVector;   (*trifactors)->workVector   = NULL;
  }
  PetscFunctionReturn(0);
}
3465 
/* Fully destroy a trifactors struct: reset its contents, destroy the
   cusparse handle, and free the struct itself. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr   = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    handle = (*trifactors)->handle;
    if (handle) {stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);}
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3482 
/* Strict weak ordering on (i,j) index tuples: lexicographic, row first. */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() < t2.get<0>() || (t1.get<0>() == t2.get<0>() && t1.get<1>() < t2.get<1>());
  }
};
3493 
/* Equality of (i,j) index tuples: both components must match. */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>();
  }
};
3503 
/* Indicator of inequality: 1 when the two values differ, 0 when equal. */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != t2) ? 1 : 0;
  }
};
3512 
/* Logical OR of two flags, returned as 0 or 1. */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != 0) || (t2 != 0);
  }
};
3521 
3522 #include <thrust/iterator/discard_iterator.h>
3523 PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
3524 {
3525   Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3526   Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
3527   THRUSTARRAY                           *cooPerm_v = NULL;
3528   thrust::device_ptr<const PetscScalar> d_v;
3529   CsrMatrix                             *matrix;
3530   PetscErrorCode                        ierr;
3531   cudaError_t                           cerr;
3532   PetscInt                              n;
3533 
3534   PetscFunctionBegin;
3535   if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
3536   if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
3537   if (!cusp->cooPerm) {
3538     ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
3539     ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
3540     PetscFunctionReturn(0);
3541   }
3542   matrix = (CsrMatrix*)cusp->mat->mat;
3543   if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
3544   if (!v) {
3545     if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
3546     goto finalize;
3547   }
3548   n = cusp->cooPerm->size();
3549   if (isCudaMem(v)) {
3550     d_v = thrust::device_pointer_cast(v);
3551   } else {
3552     cooPerm_v = new THRUSTARRAY(n);
3553     cooPerm_v->assign(v,v+n);
3554     d_v = cooPerm_v->data();
3555     ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
3556   }
3557   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3558   if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
3559     if (cusp->cooPerm_a) {
3560       THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
3561       auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
3562       thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
3563       thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
3564       delete cooPerm_w;
3565     } else {
3566       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
3567                                                                 matrix->values->begin()));
3568       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
3569                                                                 matrix->values->end()));
3570       thrust::for_each(zibit,zieit,VecCUDAPlusEquals());
3571     }
3572   } else {
3573     if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
3574       auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
3575       thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
3576     } else {
3577       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
3578                                                                 matrix->values->begin()));
3579       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
3580                                                                 matrix->values->end()));
3581       thrust::for_each(zibit,zieit,VecCUDAEquals());
3582     }
3583   }
3584   cerr = WaitForCUDA();CHKERRCUDA(cerr);
3585   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
3586 finalize:
3587   delete cooPerm_v;
3588   A->offloadmask = PETSC_OFFLOAD_GPU;
3589   ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
3590   /* shorter version of MatAssemblyEnd_SeqAIJ */
3591   ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr);
3592   ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
3593   ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr);
3594   a->reallocs         = 0;
3595   A->info.mallocs    += 0;
3596   A->info.nz_unneeded = 0;
3597   A->assembled = A->was_assembled = PETSC_TRUE;
3598   A->num_ass++;
3599   PetscFunctionReturn(0);
3600 }
3601 
3602 #include <thrust/binary_search.h>
/*
   MatSetPreallocationCOO_SeqAIJCUSPARSE - build the CSR structure of A on the GPU
   from n COO entries (coo_i[], coo_j[]).

   Side effects visible in this code:
   - cusp->cooPerm   : for each slot of the row/col-sorted COO stream, the index of the
                       original (user-ordered) entry; later used to scatter/reduce values.
   - cusp->cooPerm_a : only kept when duplicate (i,j) pairs exist; maps each sorted COO
                       entry to the index of its unique destination in the CSR arrays
                       (used as reduction keys). Freed and set to NULL when all pairs
                       are unique.
   - a->i/a->j and the row statistics (ilen/imax/rmax/nonzerorowcnt) are rebuilt on the
     host; a->a is allocated and zeroed.
   NOTE(review): IJCompare/IJEqual/IJDiff/IJSum are functors defined elsewhere in this
   file; comments below about their semantics are inferred from how they are combined.
*/
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0; /* nzr counts rows with at least one nonzero */
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  /* a different number of COO entries invalidates any previously computed permutation */
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);          /* device copy of coo_i */
    THRUSTINTARRAY d_j(n);          /* device copy of coo_j */
    THRUSTINTARRAY ii(A->rmap->n);  /* per-row upper bounds, becomes a->i[1..n] */

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* sort the (i,j) pairs, dragging along 0..n-1 so cooPerm records the original positions */
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare());
    *cusp->cooPerm_a = d_i;  /* keep sorted rows ... */
    THRUSTINTARRAY w = d_j;  /* ... and sorted cols before unique() compacts them */

    auto nekey = thrust::unique(fkey, ekey, IJEqual());
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* I couldn't come up with a more elegant algorithm */
      /* mark positions where the sorted (i,j) stream starts a new unique pair, then
         prefix-sum the marks so each sorted entry knows the index of its unique slot */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff());
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());
      (*cusp->cooPerm_a)[0] = 0; /* adjacent_difference leaves the first element unchanged */
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum());
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>());
    }
    /* ii[r] = number of unique entries with row index <= r, i.e. the CSR row offsets shifted by one */
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(),
                        search_begin, search_begin + A->rmap->n,
                        ii.begin());
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* rebuild the host CSR index arrays from the device results */
    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0;
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    /* d_j was compacted in place by unique(), so its first a->nz entries are the CSR column indices */
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
  } else {
    /* no entries: fall back to an empty standard preallocation */
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);

  /* values are not set yet; MatSetValuesCOO must be called before using the matrix */
  A->assembled = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3702 
/*
   MatSeqAIJCUSPARSEGetArrayRead - expose a read-only device pointer to the CSR
   values of A, syncing the GPU copy first. Pair with
   MatSeqAIJCUSPARSERestoreArrayRead(); only the CSR storage format is supported.
*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* ELL/HYB formats do not expose a raw values array */
  if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* make sure the device copy is up to date before handing out a pointer into it */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusparsestruct->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix*)cusparsestruct->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = matrix->values->data().get();
  PetscFunctionReturn(0);
}
3721 
/*
   MatSeqAIJCUSPARSERestoreArrayRead - release a pointer obtained with
   MatSeqAIJCUSPARSEGetArrayRead(). Read-only access: no object state bump
   and the offload mask is left untouched.
*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  *a = NULL; /* invalidate the caller's pointer */
  PetscFunctionReturn(0);
}
3731 
/*
   MatSeqAIJCUSPARSEGetArray - expose a mutable device pointer to the CSR values
   of A, syncing the GPU copy first and marking the GPU as holding the valid
   data. Pair with MatSeqAIJCUSPARSERestoreArray(); only the CSR storage format
   is supported.
*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* ELL/HYB formats do not expose a raw values array */
  if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* make sure the device copy is up to date before handing out a pointer into it */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusparsestruct->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix*)cusparsestruct->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = matrix->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* the caller may modify the device values */
  PetscFunctionReturn(0);
}
3751 
/*
   MatSeqAIJCUSPARSERestoreArray - release a pointer obtained with
   MatSeqAIJCUSPARSEGetArray(). The object state is increased because the
   caller may have modified the device values through the pointer.
*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL; /* invalidate the caller's pointer */
  PetscFunctionReturn(0);
}
3764 
/*
   MatSeqAIJCUSPARSEGetArrayWrite - expose a write-only device pointer to the
   CSR values of A. Unlike the read variants, no host-to-device sync is
   performed: the caller is expected to overwrite the values. Pair with
   MatSeqAIJCUSPARSERestoreArrayWrite(); only the CSR storage format is
   supported.
*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* ELL/HYB formats do not expose a raw values array */
  if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (!cusparsestruct->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix*)cusparsestruct->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = matrix->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* GPU will hold the authoritative values */
  PetscFunctionReturn(0);
}
3782 
/*
   MatSeqAIJCUSPARSERestoreArrayWrite - release a pointer obtained with
   MatSeqAIJCUSPARSEGetArrayWrite(). The object state is increased because the
   caller is expected to have written new values through the pointer.
*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL; /* invalidate the caller's pointer */
  PetscFunctionReturn(0);
}
3795 
/* Strict weak ordering on (row, col, value, tag) tuples that compares only the
   leading (row, col) pair lexicographically; used as the comparator when
   merging two COO streams. */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    /* lexicographic: first by row, then by column; value and tag are ignored */
    return t1.get<0>() < t2.get<0>() || (t1.get<0>() == t2.get<0>() && t1.get<1>() < t2.get<1>());
  }
};
3806 
/* Unary functor adding a fixed offset to an int; used to shift column indices
   (and row offsets) when concatenating two CSR matrices. */
struct Shift
{
  int off; /* constant offset applied to every input */

  Shift(int shift) : off(shift) {}
  __host__ __device__
  inline int operator() (const int &c)
  {
    return c + off;
  }
};
3818 
/* merges two SeqAIJCUSPARSE matrices side by side, [A';B']' operation in MATLAB notation */
/*
   Collective contract: builds C = [A,B] (horizontal concatenation, same rows) as a
   new MATSEQAIJCUSPARSE matrix when reuse == MAT_INITIAL_MATRIX, or refreshes only
   the numerical values of a previously merged C when reuse == MAT_REUSE_MATRIX.
   MAT_INPLACE_MATRIX is not supported; A and B must use CSR storage.
*/
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     Annz,Bnnz;
  cusparseStatus_t             stat;
  PetscInt                     i,m,n,zero = 0;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidHeaderSpecific(B,MAT_CLASSID,2);
  PetscValidPointer(C,4);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
  if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n);
  if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
  if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    /* --- create C and build its CSR structure entirely on the GPU --- */
    m     = A->rmap->n;
    n     = A->cmap->n + B->cmap->n;
    ierr  = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
    ierr  = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
    ierr  = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    c     = (Mat_SeqAIJ*)(*C)->data;
    Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
    Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr  = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows    = m;
    Ccusp->mat      = Cmat;
    Ccusp->mat->mat = Ccsr;
    Ccsr->num_rows  = m;
    Ccsr->num_cols  = n;
    /* cuSPARSE descriptor plus the device-resident scalar constants used by SpMV */
    stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(B);CHKERRQ(ierr);
    if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
    if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr = (CsrMatrix*)Acusp->mat->mat;
    Bcsr = (CsrMatrix*)Bcusp->mat->mat;
    Annz = (PetscInt)Acsr->column_indices->size();
    Bnnz = (PetscInt)Bcsr->column_indices->size();
    c->nz = Annz + Bnnz;
    Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    Ccsr->num_entries = c->nz;
    /* cooPerm will record where each value of A (first Annz entries) and of B
       (remaining Bnnz entries) lands inside C's values array; used on reuse */
    Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      auto Acoo = new THRUSTINTARRAY32(Annz);
      auto Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff,*Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
          ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
          ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      /* expand both row-offset arrays to per-entry row indices (CSR -> COO),
         so A and B can be merged as two sorted (row,col) streams */
      stat = cusparseXcsr2coo(Acusp->handle,
                              Aroff->data().get(),
                              Annz,
                              m,
                              Acoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle,
                              Broff->data().get(),
                              Bnnz,
                              m,
                              Bcoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      /* int tag identifying the origin of each merged entry: 1 = from A, 0 = from B */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
      /* B's columns are shifted by A->cmap->n since B occupies the right block of C */
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
#endif
      auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
      auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
      auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
      auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
      auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
      auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
      auto p1 = Ccusp->cooPerm->begin();
      auto p2 = Ccusp->cooPerm->begin();
      thrust::advance(p2,Annz);
      /* merge the two (row,col,value,tag) streams ordered by (row,col); wPerm receives the merged tags */
      PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
      /* undo the in-place column shift applied above */
      thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
#endif
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
      /* split merged positions by origin tag: positions of A entries -> cooPerm[0..Annz),
         positions of B entries -> cooPerm[Annz..) */
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      auto pred = thrust::identity<int>();
      PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
      PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
#endif
      /* compress the merged COO row indices back into C's CSR row offsets */
      stat = cusparseXcoo2csr(Ccusp->handle,
                              Ccoo->data().get(),
                              c->nz,
                              m,
                              Ccsr->row_offsets->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
                               Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
      if (Acusp->transgen && Bcusp->transgen) { /* if A and B have the transpose, generate C transpose too */
        /* C' = [A'; B'] is a vertical stack, so its CSR arrays are plain
           concatenations of A' and B' (with B' row offsets shifted by nnz(A)) */
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix *CcsrT = new CsrMatrix;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;

        Ccusp->transgen = PETSC_TRUE;
        CmatT->cprowIndices  = NULL;
        CmatT->mat = CcsrT;
        CcsrT->num_rows = n;
        CcsrT->num_cols = m;
        CcsrT->num_entries = c->nz;

        CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values = new THRUSTARRAY(c->nz);

        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
          thrust::advance(rT,-1); /* A' ends with nnz(A); B''s first offset overwrites it */
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
          thrust::copy(titb,tite,rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

        stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
                                 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
                                 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* --- mirror the CSR index structure on the host side of C --- */
    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii   = *Ccsr->row_offsets;
      jj   = *Ccsr->column_indices;
      cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    } else {
      cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
    c->maxnz = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax = 0;
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i+1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax,nn);
    }
    ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
    (*C)->nonzerostate++;
    ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
    ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* --- MAT_REUSE_MATRIX: structure unchanged, scatter new values through cooPerm --- */
    if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",(*C)->rmap->n,B->rmap->n);
    c = (Mat_SeqAIJ*)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
      if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
      if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
      if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
      if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix*)Acusp->mat->mat;
      Bcsr = (CsrMatrix*)Bcusp->mat->mat;
      Ccsr = (CsrMatrix*)Ccusp->mat->mat;
      if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
      if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
      if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
      if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
      if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid,Acsr->num_entries);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      /* cooPerm[0..Annz) holds the destinations of A's values inside C's values array */
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      thrust::for_each(zibait,zieait,VecCUDAEquals());
      /* cooPerm[Annz..) holds the destinations of B's values */
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
      thrust::for_each(zibbit,ziebit,VecCUDAEquals());
      if (Acusp->transgen && Bcusp->transgen && Ccusp->transgen) {
        /* refresh C' values by re-concatenating A' and B' values */
        if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
      }
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    }
  }
  ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
4121 
/*
   MatSeqAIJCopySubArray_SeqAIJCUSPARSE - gather n matrix values into v[] from
   the device copy of A's CSR values.

   - idx != NULL : v[k] = values[idx[k]] (gather through a device copy of idx)
   - idx == NULL : the first n values are copied contiguously
   v may be host or device memory; the memory space is detected with isCudaMem().

   Fix: the final logging call used PetscLogCpuToGpu, but when v is host memory
   the data movement here is device->host (cudaMemcpyDeviceToHost in both
   branches), so it must be PetscLogGpuToCpu.
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  PetscErrorCode    ierr;
  bool              dmem;   /* true when v resides in device memory */
  const PetscScalar *av;
  cudaError_t       cerr;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
  if (n && idx) {
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n); /* upload the gather indices */
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

    THRUSTARRAY *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      /* gather into a device scratch buffer, then copy it back to the host v */
      w = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* dv[k] = av[widx[k]] via the VecCUDAEquals functor */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (w) {
      cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    delete w;
  } else {
    cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  if (!dmem) { ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr); } /* device->host traffic */
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4161