xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 6f3d89d0b697a3d79aa40878724d2c791c95edac)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_CXX_COMPLEX_FIX
7 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
8 
9 #include <petscconf.h>
10 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
11 #include <../src/mat/impls/sbaij/seq/sbaij.h>
12 #include <../src/vec/vec/impls/dvecimpl.h>
13 #include <petsc/private/vecimpl.h>
14 #undef VecType
15 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
16 
/* Value names for MatCUSPARSEStorageFormat, in 0-based enum order, for use with PetscOptionsEnum();
   the trailing entries are (by PETSc convention) the enum type name, the option prefix, and a null terminator */
const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
18 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
19   /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
20     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
21 
22   typedef enum {
23       CUSPARSE_MV_ALG_DEFAULT = 0,
24       CUSPARSE_COOMV_ALG      = 1,
25       CUSPARSE_CSRMV_ALG1     = 2,
26       CUSPARSE_CSRMV_ALG2     = 3
27   } cusparseSpMVAlg_t;
28 
29   typedef enum {
30       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
31       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
32       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
33       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
34       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
35       CUSPARSE_SPMM_ALG_DEFAULT = 0,
36       CUSPARSE_SPMM_COO_ALG1    = 1,
37       CUSPARSE_SPMM_COO_ALG2    = 2,
38       CUSPARSE_SPMM_COO_ALG3    = 3,
39       CUSPARSE_SPMM_COO_ALG4    = 5,
40       CUSPARSE_SPMM_CSR_ALG1    = 4,
41       CUSPARSE_SPMM_CSR_ALG2    = 6,
42   } cusparseSpMMAlg_t;
43 
44   typedef enum {
45       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
46       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
47   } cusparseCsr2CscAlg_t;
48   */
49   const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
50   const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
51   const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
52 #endif
53 
54 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
55 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
56 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
57 
58 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
59 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
60 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
61 
62 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
63 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
64 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
65 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
66 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
67 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
68 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
69 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
70 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
71 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
72 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
73 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
74 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
75 
76 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
77 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
78 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
79 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
80 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
81 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
82 
83 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
84 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
85 
86 PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
87 PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
88 
89 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
90 
/*
   MatCUSPARSESetStream - Records the given CUDA stream in the matrix's GPU data and
   makes the matrix's cuSPARSE handle issue its work on that stream.

   Input Parameters:
+  A      - the matrix (must already have its GPU structure allocated in A->spptr)
-  stream - the CUDA stream to use
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusp->stream = stream;
  stat         = cusparseSetStream(cusp->handle,cusp->stream);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}
102 
/*
   MatCUSPARSESetHandle - Adopts the given cuSPARSE handle for this matrix, destroying any
   previously held (different) handle, and switches the handle to device pointer mode.

   Input Parameters:
+  A      - the matrix (must already have its GPU structure allocated in A->spptr)
-  handle - the cuSPARSE handle to adopt
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusp->handle != handle) {
    /* release the old handle before adopting the new one */
    if (cusp->handle) {stat = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(stat);}
    cusp->handle = handle;
  }
  stat = cusparseSetPointerMode(cusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}
119 
/*
   MatCUSPARSEClearHandle - Forgets the matrix's cuSPARSE handle without destroying it
   (presumably because ownership lies with whoever supplied it via MatCUSPARSESetHandle()).
   A no-op for matrices that are not of type MATSEQAIJCUSPARSE or have no GPU structure.
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          iscusp;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&iscusp);CHKERRQ(ierr);
  if (!iscusp || !cusp) PetscFunctionReturn(0);
  if (cusp->handle) cusp->handle = 0; /* drop the reference only; no cusparseDestroy() here */
  PetscFunctionReturn(0);
}
132 
/* Reports the solver package of factor matrices produced by MatGetFactor_seqaijcusparse_cusparse();
   composed on those matrices as "MatFactorGetSolverType_C" */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
139 
140 /*MC
141   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
142   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
143   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
144   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
145   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
146   algorithms are not recommended. This class does NOT support direct solver operations.
147 
148   Level: beginner
149 
150 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
151 M*/
152 
/* Factory registered with MatSolverTypeRegister for MATSOLVERCUSPARSE: creates the (empty)
   factor matrix B for A and wires up the symbolic-factorization entry points; the numeric
   entry points are installed later by the symbolic routines themselves. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  /* the factor is square with the same local dimension as A */
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  (*B)->useordering = PETSC_TRUE;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* MAT_SKIP_ALLOCATION: the symbolic factorization allocates the actual storage */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
178 
/* Type-specific implementation behind MatCUSPARSESetFormat() for SEQAIJCUSPARSE matrices.
   Sequential matrices keep a single storage format, so MAT_CUSPARSE_MULT and
   MAT_CUSPARSE_ALL both update the same field. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT: /* fall through */
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
196 
197 /*@
198    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
199    operation. Only the MatMult operation can use different GPU storage formats
200    for MPIAIJCUSPARSE matrices.
201    Not Collective
202 
203    Input Parameters:
204 +  A - Matrix of type SEQAIJCUSPARSE
205 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
206 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
207 
208    Output Parameter:
209 
210    Level: intermediate
211 
212 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
213 @*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation if composed on A; a silent no-op for other matrix types */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
223 
224 /*@
225    MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose
226 
227    Collective on mat
228 
229    Input Parameters:
230 +  A - Matrix of type SEQAIJCUSPARSE
231 -  transgen - the boolean flag
232 
233    Level: intermediate
234 
235 .seealso: MATSEQAIJCUSPARSE, MatAIJCUSPARSESetGenerateTranspose()
236 @*/
PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
{
  PetscErrorCode ierr;
  PetscBool      iscusp;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&iscusp);CHKERRQ(ierr);
  if (!iscusp) PetscFunctionReturn(0); /* silently ignore other matrix types */
  if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
  {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

    cusp->transgen = transgen;
    if (!transgen) {
      /* destroy any existing transpose so it cannot be reused stale if transgen is set to true later */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}
256 
/* Processes the -mat_cusparse_* runtime options for a SEQAIJCUSPARSE matrix: explicit-transpose
   generation, storage format(s), and (CUDA >= 11) the cuSPARSE SpMV/SpMM/csr2csc algorithm choices.
   Options only apply to unfactored matrices. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscBool transgen = cusparsestruct->transgen;

    ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr);}

    /* format for SpMV only */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    /* format for all operations; processed second, so it overrides the option above if both are given */
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    cusparsestruct->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");

    cusparsestruct->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    /* same positional-consistency check as for SpMV above */
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    cusparsestruct->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
   #endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
300 
/* Symbolic ILU: discard any previous GPU triangular-factor data, run the CPU symbolic phase,
   and route the numeric phase to the CUSPARSE implementation. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *tri = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&tri);CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
312 
/* Symbolic LU: discard any previous GPU triangular-factor data, run the CPU symbolic phase,
   and route the numeric phase to the CUSPARSE implementation. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *tri = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&tri);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
324 
/* Symbolic ICC: discard any previous GPU triangular-factor data, run the CPU symbolic phase,
   and route the numeric phase to the CUSPARSE implementation. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *tri = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&tri);CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
336 
/* Symbolic Cholesky: discard any previous GPU triangular-factor data, run the CPU symbolic phase,
   and route the numeric phase to the CUSPARSE implementation. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *tri = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&tri);CHKERRQ(ierr);
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
348 
/* Builds (first call) or value-updates (later calls) the GPU CSR copy of the unit-diagonal lower
   triangular factor L from the host ILU factor in A, then runs the cuSPARSE triangular-solve
   analysis on it.
   NOTE(review): assumes the strictly-lower entries of factored row i occupy a->j/a->a in
   [ai[i],ai[i+1]) -- presumably PETSc's factored-AIJ layout; confirm against MatLUFactorNumeric_SeqAIJ. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* nothing to do unless the current values live on the CPU */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) { /* first call: build structure + values, then run analysis */
        PetscScalar                       *AALo;

        /* pinned (page-locked) host buffers speed up the host->device transfers below */
        cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the lower triangular matrix */
        cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the lower triangular matrix; row 0 is just the unit diagonal entry */
        AiLo[0]  = (PetscInt) 0;
        AiLo[n]  = nzLower;
        AjLo[0]  = (PetscInt) 0;
        AALo[0]  = (MatScalar) 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        rowOffset= 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          /* copy the strictly-lower entries of row i, then append the implicit unit diagonal */
          ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* the newer solve API needs a caller-provided work buffer; query its size, then allocate */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
                               #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                               #endif
);CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        /* keep the pinned value buffer for fast value-only updates; index buffers are no longer needed */
        loTriFactor->AA_h = AALo;
        cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the lower triangular matrix (same traversal as above, values only) */
        loTriFactor->AA_h[0]  = 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
          offset      += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset      += 1;
          v  += nz;
        }
        /* sparsity pattern is unchanged; push only the new values to the GPU */
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
493 
/* Builds (first call) or value-updates (later calls) the GPU CSR copy of the upper triangular
   factor U from the host ILU factor in A, then runs the cuSPARSE triangular-solve analysis on it.
   NOTE(review): assumes the strictly-upper entries of factored row i start at a->j/a->a index
   adiag[i+1]+1 with the diagonal slot at adiag[i], and that adiag decreases with i -- presumably
   PETSc's factored-AIJ layout; confirm against MatLUFactorNumeric_SeqAIJ. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* nothing to do unless the current values live on the CPU */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];
      if (!upTriFactor) { /* first call: build structure + values, then run analysis */
        PetscScalar *AAUp;

        /* pinned (page-locked) host buffers speed up the host->device transfers below */
        cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix, walking rows from last to first and filling the
           CSR arrays back-to-front */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;
          vi = aj + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements; v[nz] addresses aa[adiag[i]], and its reciprocal
             is stored as U's diagonal entry */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1./v[nz];
          AiUp[i]      = AiUp[i+1] - (nz+1);

          ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = n;
        upTriFactor->csrMat->num_cols = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* the newer solve API needs a caller-provided work buffer; query its size, then allocate */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                     upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                     upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                     upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                     &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
                               #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                               #endif
);CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
        /* keep the pinned value buffer for fast value-only updates; index buffers are no longer needed */
        upTriFactor->AA_h = AAUp;
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!upTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the upper triangular matrix (same traversal as above, values only) */
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1./v[nz];
          ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
        }
        /* sparsity pattern is unchanged; push only the new values to the GPU */
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
        ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
637 
/* Pushes the ILU triangular factors of A to the GPU and caches the row/column
   permutation indices (when the orderings are not the identity) on the device
   for later use by the cuSPARSE solve routines. */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *aij  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *fact = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow,iscol;
  PetscBool                    rowIdentity,colIdentity;
  PetscInt                     n;

  PetscFunctionBegin;
  if (!fact) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  isrow = aij->row;
  iscol = aij->icol;
  n     = A->rmap->n;

  /* build/upload both triangular factors on the device */
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  if (!fact->workVector) fact->workVector = new THRUSTARRAY(n);
  fact->nnz      = aij->nz;
  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* cache the row permutation on the device (only once, and only if non-trivial) */
  ierr = ISIdentity(isrow,&rowIdentity);CHKERRQ(ierr);
  if (!rowIdentity && !fact->rpermIndices) {
    const PetscInt *r;

    ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
    fact->rpermIndices = new THRUSTINTARRAY(n);
    fact->rpermIndices->assign(r,r+n);
    ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* cache the column permutation on the device (only once, and only if non-trivial) */
  ierr = ISIdentity(iscol,&colIdentity);CHKERRQ(ierr);
  if (!colIdentity && !fact->cpermIndices) {
    const PetscInt *c;

    ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
    fact->cpermIndices = new THRUSTINTARRAY(n);
    fact->cpermIndices->assign(c,c+n);
    ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
681 
/*
   Builds (first call) or refreshes (later calls) the device-side triangular
   factors for an ICC/Cholesky factorization.

   Only the upper triangle is stored explicitly.  Two value arrays are built
   from it on the host and uploaded:
     AAUp - upper factor values: inverted diagonal, negated off-diagonals
     AALo - AAUp additionally divided by the diagonal pivot
   The "lower" factor shares the upper factor's CSR structure (AiUp/AjUp) and
   is solved as a TRANSPOSE operation on that structure (see solveOp below).

   On the first call the CSR structure is created and the cuSPARSE solve
   analysis is run for both factors; on later calls only the numerical values
   are recomputed and re-uploaded.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is cast both to Mat_SeqAIJ (above) and Mat_SeqSBAIJ
     (below); this relies on the i/j/a members of the two structs lining up --
     confirm against the struct definitions */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers for the two value arrays */
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          /* the last stored value of row i, v[nz], is the (already inverted
             pivot's reciprocal source) diagonal entry */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            /* copy the nz off-diagonal column indices and values, then negate
               the values; AALo gets them additionally scaled by 1/diagonal */
            ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                                #endif
);CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        /* intentionally FILL_MODE_UPPER: the lower solve reuses the upper
           factor's structure and is done with a TRANSPOSE operation below */
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                                #endif
);CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      } else {
        /* structure and analysis already exist: only recompute the values */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
897 
/* Pushes the ICC triangular factors of A to the GPU and, for a non-identity
   ordering, caches the permutation (and its inverse) on the device.

   Fix: guard the permutation-array creation with !rpermIndices so repeated
   numeric factorizations do not re-new (and leak) the device arrays; this
   matches MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU above. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  /* only the upper triangle is stored: off-diagonals count twice plus the diagonal */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity && !cusparseTriFactors->rpermIndices) {
    IS             iip;
    const PetscInt *irip,*rip;

    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
    /* rperm = ordering itself, cperm = its inverse */
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
935 
/* Numeric LU factorization: factor on the host, then select the MatSolve
   implementation and push the triangular factors to the GPU. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *bseq = (Mat_SeqAIJ*)B->data;
  PetscBool      rowIdentity,colIdentity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* numeric factorization happens on the host */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* determine which version of MatSolve needs to be used:
     natural-ordering kernels apply when both orderings are the identity */
  ierr = ISIdentity(bseq->row,&rowIdentity);CHKERRQ(ierr);
  ierr = ISIdentity(bseq->col,&colIdentity);CHKERRQ(ierr);
  if (rowIdentity && colIdentity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors onto the device */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
966 
/* Numeric Cholesky factorization: factor on the host, then select the
   MatSolve implementation and push the triangular factors to the GPU. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *bseq = (Mat_SeqAIJ*)B->data;
  PetscBool      permIdentity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* numeric factorization happens on the host */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* determine which version of MatSolve needs to be used:
     natural-ordering kernels apply when the ordering is the identity */
  ierr = ISIdentity(bseq->row,&permIdentity);CHKERRQ(ierr);
  if (permIdentity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors onto the device */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
996 
/* Builds the explicit transposes (CSC) of the lower and upper triangular
   factors on the GPU and runs the cuSPARSE solve analysis on them, so that
   MatSolveTranspose can use non-transpose solves on the transposed factors.

   Fixes: the MAT_CUSPARSEGenerateTranspose log event was begun a second time
   after each csr2csc instead of being ended, leaving the event permanently
   unbalanced in the log; both occurrences now call PetscLogEventEnd. */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor;
     transposing flips the fill mode, the rest is copied over */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* CUDA 11 csr2csc needs an explicit workspace; query its size and allocate */
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                       loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                       loTriFactor->csrMat->values->data().get(),
                                       loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(),
                                       loTriFactorT->csrMat->values->data().get(),
                                       loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer
                        #else
                          loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
                        #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  /* was PetscLogEventBegin: end the event begun above */
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                           loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                          #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                upTriFactor->csrMat->values->data().get(),
                                upTriFactor->csrMat->row_offsets->data().get(),
                                upTriFactor->csrMat->column_indices->data().get(),
                                upTriFactorT->csrMat->values->data().get(),
                                upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                CUSPARSE_ACTION_NUMERIC,indexBase,
                                CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer
                        #else
                          upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
                        #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  /* was PetscLogEventBegin: end the event begun above */
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
  #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
  #endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                           upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                          #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
1198 
/*
   MatSeqAIJCUSPARSEGenerateTransposeForMult - lazily builds the explicit transpose of A
   on the GPU and caches it in cusparsestruct->matTranspose, so that MatMultTranspose-type
   operations can later run as an ordinary (non-transposed) SpMV on the cached matrix.

   A no-op when transpose generation is disabled (cusparsestruct->transgen is false),
   when the transpose has already been built, or when A has zero rows or zero columns.

   For the CSR format the transpose is obtained with a csr2csc conversion; for ELL/HYB
   (only available with CUDA < 11) the matrix is converted HYB->CSR, transposed via
   csr2csc, then converted back CSR->HYB.
*/
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* quick exits: transpose not requested, already present, or empty matrix */
  if (!cusparsestruct->transgen || cusparsestruct->matTranspose || !A->rmap->n || !A->cmap->n) PetscFunctionReturn(0);
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* create cusparse matrix; matstructT is necessarily NULL here (guaranteed by the guard above) */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr); /* the transpose inherits A's index base */
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

  /* set alpha and beta: device-resident 1.0/0.0/1.0 scalars, matching the DEVICE pointer mode set below */
  err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    /* the transpose has swapped dimensions and the same number of nonzeros */
    matrixT->num_rows = A->cmap->n;
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* upload the full (uncompressed) host row offsets a->i; csr2csc needs A's complete row pointer */
    if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
    cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

    /* compute the transpose, i.e. the CSC */
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* CUDA >= 11 requires a user-provided workspace for csr2csc: query its size, then allocate */
    stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                  A->cmap->n, matrix->num_entries,
                                  matrix->values->data().get(),
                                  cusparsestruct->rowoffsets_gpu->data().get(),
                                  matrix->column_indices->data().get(),
                                  matrixT->values->data().get(),
                                  matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                  CUSPARSE_ACTION_NUMERIC,indexBase,
                                  cusparsestruct->csr2cscAlg, &cusparsestruct->csr2cscBufferSize);CHKERRCUSPARSE(stat);
    err = cudaMalloc(&cusparsestruct->csr2cscBuffer,cusparsestruct->csr2cscBufferSize);CHKERRCUDA(err);
   #endif

    stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            cusparsestruct->rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                            CUSPARSE_ACTION_NUMERIC,indexBase,
                            cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
                          #else
                            matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase
                          #endif
);CHKERRCUSPARSE(stat);
    matstructT->mat = matrixT;

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* generic-API (cusparseSpMat) descriptor for the transpose, required by the CUDA >= 11 SpMV/SpMM path */
    stat = cusparseCreateCsr(&matstructT->matDescr,
                             matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                             matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                             matrixT->values->data().get(),
                             CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                             indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
   #endif
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
   #else
    CsrMatrix *temp  = new CsrMatrix;
    CsrMatrix *tempT = new CsrMatrix;
    /* First convert HYB to CSR */
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUSPARSE(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    /* NOTE(review): tempT keeps A's (rmap->n, cmap->n) dimensions rather than the swapped
       ones; presumably this path is only exercised for square matrices -- confirm */
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSPARSE(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    /* delete temporaries: the intermediate CSR/CSC copies are no longer needed once the HYB matrix exists */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
   #endif
  }
  err  = WaitForCUDA();CHKERRCUDA(err);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices are not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}
1357 
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
   MatSolveTranspose_SeqAIJCUSPARSE - solves A^T x = b on the GPU with the cached
   transposed triangular factors (built lazily on first use), applying the row
   permutation up front and the column permutation at the end.

   Solve order for the transposed system is U^T first, then L^T (the reverse of
   the forward solve in MatSolve_SeqAIJCUSPARSE).
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly (first transpose solve only;
     subsequent calls reuse the factors cached in A->spptr) */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: x[i] = b[rperm[i]] */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve with the transposed upper factor: tempGPU <- U^T \ x */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Then, solve with the transposed lower factor: x <- L^T \ tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves cost 2*nnz - n flops */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1444 
/*
   MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering - solves A^T x = b on the GPU for
   factors computed in natural ordering, i.e. without row/column permutation vectors,
   so b is consumed directly and x is produced directly (no scatter steps).

   Solve order for the transposed system is U^T first, then L^T.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly (first transpose solve only;
     subsequent calls reuse the factors cached in A->spptr) */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve with the transposed upper factor: tempGPU <- U^T \ b */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Then, solve with the transposed lower factor: x <- L^T \ tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves cost 2*nnz - n flops */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1512 
/*
   MatSolve_SeqAIJCUSPARSE - solves A x = b on the GPU using the cached lower/upper
   triangular factors, with row/column permutations from the factorization ordering:
   the right-hand side is gathered with rpermIndices before the L and U solves, and
   the result is gathered with cpermIndices afterwards.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: tempGPU[i] = b[rperm[i]] */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L: xarray <- L \ tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Then, solve U: tempGPU <- U \ xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Last, reorder with the column permutation: x[i] = tempGPU[cperm[i]] */
  thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves cost 2*nnz - n flops */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1587 
/*
   MatSolve_SeqAIJCUSPARSE_NaturalOrdering - solves A x = b on the GPU for factors
   computed in natural ordering (no row/column permutation vectors), so b is consumed
   directly and x is produced directly: L solve into the work vector, then U solve.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L: tempGPU <- L \ b */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Next, solve U: xarray <- U \ tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves cost 2*nnz - n flops */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1647 
/*
   MatSeqAIJCUSPARSECopyFromGPU - mirrors the GPU-resident nonzero values back into
   the host CSR value array a->a.

   Only the numerical values are transferred; the sparsity pattern is untouched.
   A no-op unless the up-to-date data currently lives only on the GPU; afterwards
   the matrix is marked as valid on both host and device.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *aij  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;
  cudaError_t        cerr;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    CsrMatrix *csr = (CsrMatrix*)cusp->mat->mat;

    ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    /* device -> host transfer of the aij->nz scalar values */
    cerr = cudaMemcpy(aij->a,csr->values->data().get(),aij->nz*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuToCpu(aij->nz*sizeof(PetscScalar));CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    A->offloadmask = PETSC_OFFLOAD_BOTH; /* host and device copies now agree */
  }
  PetscFunctionReturn(0);
}
1668 
/*
   MatSeqAIJGetArray_SeqAIJCUSPARSE - hands out a pointer to the host-side value array,
   first synchronizing the values from the GPU when the device copy is newer.

   Because the caller may write through the returned pointer, the offload mask is
   downgraded to PETSC_OFFLOAD_CPU so the (possibly stale) device copy is not trusted.
*/
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* pull fresh values off the device before exposing the host array */
  ierr   = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  *array = aij->a;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
1680 
1681 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1682 {
1683   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1684   Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
1685   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1686   PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
1687   PetscErrorCode               ierr;
1688   cusparseStatus_t             stat;
1689   PetscBool                    both = PETSC_TRUE;
1690   cudaError_t                  err;
1691 
1692   PetscFunctionBegin;
1693   if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Cannot copy to GPU");
1694   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
1695     if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
1696       /* Copy values only */
1697       CsrMatrix *matrix,*matrixT;
1698       matrix = (CsrMatrix*)cusparsestruct->mat->mat;
1699 
1700       if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR values");
1701       ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1702       matrix->values->assign(a->a, a->a+a->nz);
1703       err  = WaitForCUDA();CHKERRCUDA(err);
1704       ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
1705       ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1706 
1707       /* Update matT when it was built before */
1708       if (cusparsestruct->matTranspose) {
1709         cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
1710         matrixT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
1711         ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1712         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1713                             A->cmap->n, matrix->num_entries,
1714                             matrix->values->data().get(),
1715                             cusparsestruct->rowoffsets_gpu->data().get(),
1716                             matrix->column_indices->data().get(),
1717                             matrixT->values->data().get(),
1718                           #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1719                             matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1720                             CUSPARSE_ACTION_NUMERIC,indexBase,
1721                             cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
1722                           #else
1723                             matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1724                             CUSPARSE_ACTION_NUMERIC, indexBase
1725                           #endif
1726 );CHKERRCUSPARSE(stat);
1727         err  = WaitForCUDA();CHKERRCUDA(err);
1728         ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1729       }
1730     } else {
1731       PetscInt nnz;
1732       ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1733       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
1734       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
1735       delete cusparsestruct->workVector;
1736       delete cusparsestruct->rowoffsets_gpu;
1737       try {
1738         if (a->compressedrow.use) {
1739           m    = a->compressedrow.nrows;
1740           ii   = a->compressedrow.i;
1741           ridx = a->compressedrow.rindex;
1742         } else {
1743           m    = A->rmap->n;
1744           ii   = a->i;
1745           ridx = NULL;
1746         }
1747         if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR row data");
1748         if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR column data");
1749         if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
1750         else nnz = a->nz;
1751 
1752         /* create cusparse matrix */
1753         cusparsestruct->nrows = m;
1754         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1755         stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
1756         stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
1757         stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
1758 
1759         err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
1760         err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1761         err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1762         err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1763         err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1764         err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1765         stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
1766 
1767         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1768         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1769           /* set the matrix */
1770           CsrMatrix *mat= new CsrMatrix;
1771           mat->num_rows = m;
1772           mat->num_cols = A->cmap->n;
1773           mat->num_entries = nnz;
1774           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1775           mat->row_offsets->assign(ii, ii + m+1);
1776 
1777           mat->column_indices = new THRUSTINTARRAY32(nnz);
1778           mat->column_indices->assign(a->j, a->j+nnz);
1779 
1780           mat->values = new THRUSTARRAY(nnz);
1781           if (a->a) mat->values->assign(a->a, a->a+nnz);
1782 
1783           /* assign the pointer */
1784           matstruct->mat = mat;
1785          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1786           if (mat->num_rows) { /* cusparse errors on empty matrices! */
1787             stat = cusparseCreateCsr(&matstruct->matDescr,
1788                                     mat->num_rows, mat->num_cols, mat->num_entries,
1789                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
1790                                     mat->values->data().get(),
1791                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
1792                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
1793           }
1794          #endif
1795         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1796          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1797           SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1798          #else
1799           CsrMatrix *mat= new CsrMatrix;
1800           mat->num_rows = m;
1801           mat->num_cols = A->cmap->n;
1802           mat->num_entries = nnz;
1803           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1804           mat->row_offsets->assign(ii, ii + m+1);
1805 
1806           mat->column_indices = new THRUSTINTARRAY32(nnz);
1807           mat->column_indices->assign(a->j, a->j+nnz);
1808 
1809           mat->values = new THRUSTARRAY(nnz);
1810           if (a->a) mat->values->assign(a->a, a->a+nnz);
1811 
1812           cusparseHybMat_t hybMat;
1813           stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1814           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1815             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1816           stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
1817               matstruct->descr, mat->values->data().get(),
1818               mat->row_offsets->data().get(),
1819               mat->column_indices->data().get(),
1820               hybMat, 0, partition);CHKERRCUSPARSE(stat);
1821           /* assign the pointer */
1822           matstruct->mat = hybMat;
1823 
1824           if (mat) {
1825             if (mat->values) delete (THRUSTARRAY*)mat->values;
1826             if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
1827             if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
1828             delete (CsrMatrix*)mat;
1829           }
1830          #endif
1831         }
1832 
1833         /* assign the compressed row indices */
1834         if (a->compressedrow.use) {
1835           cusparsestruct->workVector = new THRUSTARRAY(m);
1836           matstruct->cprowIndices    = new THRUSTINTARRAY(m);
1837           matstruct->cprowIndices->assign(ridx,ridx+m);
1838           tmp = m;
1839         } else {
1840           cusparsestruct->workVector = NULL;
1841           matstruct->cprowIndices    = NULL;
1842           tmp = 0;
1843         }
1844         ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);
1845 
1846         /* assign the pointer */
1847         cusparsestruct->mat = matstruct;
1848       } catch(char *ex) {
1849         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1850       }
1851       err  = WaitForCUDA();CHKERRCUDA(err);
1852       ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1853       cusparsestruct->nonzerostate = A->nonzerostate;
1854     }
1855     if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
1856   }
1857   PetscFunctionReturn(0);
1858 }
1859 
/* Thrust functor for zipped (src,dst) tuples: accumulates element 0 into
   element 1, i.e. dst = dst + src.  Usable from host or device code. */
struct VecCUDAPlusEquals
{
  template <typename TupleT>
  __host__ __device__
  void operator()(TupleT zipped)
  {
    thrust::get<1>(zipped) = thrust::get<1>(zipped) + thrust::get<0>(zipped);
  }
};
1869 
/* Thrust functor for zipped (src,dst) tuples: copies element 0 into
   element 1, i.e. dst = src.  Usable from host or device code. */
struct VecCUDAEquals
{
  template <typename TupleT>
  __host__ __device__
  void operator()(TupleT zipped)
  {
    thrust::get<1>(zipped) = thrust::get<0>(zipped);
  }
};
1879 
/* Thrust functor for zipped tuples: copies in the opposite direction of
   VecCUDAEquals, i.e. element 0 = element 1.  Usable from host or device. */
struct VecCUDAEqualsReverse
{
  template <typename TupleT>
  __host__ __device__
  void operator()(TupleT zipped)
  {
    thrust::get<0>(zipped) = thrust::get<1>(zipped);
  }
};
1889 
/* Product data attached to Mat_Product for mat-mat operations involving a
   SEQAIJCUSPARSE matrix; released by MatDestroy_MatMatCusparse() */
struct MatMatCusparse {
  PetscBool             cisdense; /* caller asked for a MATSEQDENSE (host) C; numeric phase converts C back after computing on the GPU */
  PetscScalar           *Bt; /* device buffer holding an explicit B^T for the pre-CUDA-11 csrmm path (ABt/RARt), since csrmm cannot transpose B */
  Mat                   X; /* intermediate dense matrix used by the PtAP/RARt products */
  PetscBool             reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops; /* flop count logged by the numeric phase */
  CsrMatrix             *Bcsr; /* alternate CSR form of B used when B is stored in compressed row format */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr; /* sparse descriptor wrapping Bcsr */
  PetscBool             initialized;   /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr; /* dense descriptor for B (SpMM path) */
  cusparseDnMatDescr_t  matCDescr; /* dense descriptor for C (or for X in the PtAP/RARt case) */
  PetscInt              Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
  size_t                mmBufferSize; /* current size of mmBuffer, to avoid needless reallocation */
  void                  *mmBuffer; /* device workspace for cusparseSpMM / cusparseSpGEMM_compute */
  void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc; /* descriptor carrying SpGEMM state between symbolic and numeric phases */
#endif
};
1909 
/* Destructor for the MatMatCusparse product data: releases device buffers,
   cusparse descriptors, the intermediate dense matrix X, and the struct itself */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  PetscErrorCode   ierr;
  MatMatCusparse   *mm = (MatMatCusparse *)data;
  cudaError_t      cerr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t stat;
 #endif

  PetscFunctionBegin;
  cerr = cudaFree(mm->Bt);CHKERRCUDA(cerr); /* cudaFree(NULL) is a documented no-op */
  delete mm->Bcsr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* descriptors and buffers are only present if the SpMM/SpGEMM paths ran */
  if (mm->matSpBDescr) {
    stat = cusparseDestroySpMat(mm->matSpBDescr);CHKERRCUSPARSE(stat);
  }
  if (mm->mmBuffer) {
    cerr = cudaFree(mm->mmBuffer);CHKERRCUDA(cerr);
  }
  if (mm->mmBuffer2) {
    cerr = cudaFree(mm->mmBuffer2);CHKERRCUDA(cerr);
  }
  if (mm->matBDescr) {
    stat = cusparseDestroyDnMat(mm->matBDescr);CHKERRCUSPARSE(stat);
  }
  if (mm->matCDescr) {
    stat = cusparseDestroyDnMat(mm->matCDescr);CHKERRCUSPARSE(stat);
  }
  if (mm->spgemmDesc) {
    stat = cusparseSpGEMM_destroyDescr(mm->spgemmDesc);CHKERRCUSPARSE(stat);
  }
 #endif
  ierr = MatDestroy(&mm->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1934 
1935 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1936 
/*
  Numeric phase for products where A is SEQAIJCUSPARSE and B (and C) are dense.
  AB/AtB/ABt are computed directly; for PtAP and RARt the intermediate dense
  X = op(A)*op(B) is computed here, then combined with B through
  MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private (the boolean flag differs
  between the two cases).  With CUDA >= 11 the generic cusparseSpMM API is used
  and its descriptors/work buffer are cached in the product data; with older
  CUDA the legacy csrmm routine is used, which cannot transpose B, so B^T is
  formed explicitly with cublasXgeam first.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* select op(A), the mult struct to use, and the dimensions (m x n) of the sparse-dense result */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    /* prefer the explicitly stored transpose (when transgen is set); otherwise let cusparse transpose on the fly */
    if (!cusp->transgen) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);

  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* PtAP/RARt write the sparse-dense result into the intermediate X; the other products write into C directly */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    /* a changed leading dimension invalidates the cached dense descriptor; destroy and recreate it */
    if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Clda = clda;
    }

    /* lazily create the sparse descriptor for A's CSR data (shared with other uses of this mult struct) */
    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
    /* grow the work buffer only when the required size exceeds what we already hold */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
      cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
 #else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* form B^T explicitly in mmdata->Bt (allocated in the symbolic phase) via an out-of-place geam */
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k    = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
 #endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  /* PtAP/RARt: combine the intermediate X with B on the GPU to form C */
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  /* undo the conversions performed for the user: C back to host dense, B back to its original type */
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2110 
/*
  Symbolic phase for products where A is SEQAIJCUSPARSE and B is dense: sizes C,
  forces it to the MATSEQDENSECUDA type (remembering whether the caller wanted a
  host MATSEQDENSE result), and allocates the product data consumed by the
  numeric phase.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          isdensehost,isaijcusp;
  PetscErrorCode     ierr;
  MatMatCusparse     *mm;
  Mat_SeqAIJCUSPARSE *acusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&isaijcusp);CHKERRQ(ierr);
  if (!isaijcusp) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  /* result dimensions depend on which operands are (implicitly) transposed */
  if (product->type == MATPRODUCT_AB) {
    m = A->rmap->n; n = B->cmap->n;
  } else if (product->type == MATPRODUCT_AtB) {
    m = A->cmap->n; n = B->cmap->n;
  } else if (product->type == MATPRODUCT_ABt) {
    m = A->rmap->n; n = B->rmap->n;
  } else if (product->type == MATPRODUCT_PtAP) {
    m = B->cmap->n; n = B->cmap->n;
  } else if (product->type == MATPRODUCT_RARt) {
    m = B->rmap->n; n = B->rmap->n;
  } else SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), compute on the GPU anyway and let the numeric phase convert back */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&isdensehost);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mm);CHKERRQ(ierr);
  mm->cisdense = isdensehost;
 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mm->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
 #endif
  /* for these products we need intermediate storage; do not preallocate,
     since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mm->X);CHKERRQ(ierr);
    ierr = MatSetType(mm->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) {
      ierr = MatSetSizes(mm->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mm->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mm;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2184 
/*
  Numeric phase for sparse-sparse products C = op(A)*op(B) with all operands
  SEQAIJCUSPARSE.  If the values were already computed in the symbolic phase
  (reusesym), or C has no nonzeros, the computation is skipped and only the
  assembly bookkeeping at 'finalize' runs.  With CUDA >= 11 the cached SpGEMM
  descriptor/buffers from the symbolic phase are reused; with older CUDA the
  legacy csrgemm routine recomputes into C's preallocated CSR arrays.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t         BmatSpDescr;
#endif

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;
  A = product->A;
  B = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix*)Cmat->mat;
    if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
    goto finalize;
  }
  /* empty result: nothing to compute, but the assembly bookkeeping below must still run */
  if (!c->nz) goto finalize;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);

  /* symmetric operands let us fold AtB/ABt into the plain AB case */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
  if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
  /* pick the (possibly transposed) mult structs matching the product type */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix*)Cmat->mat;
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
  if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  /* recompute the numeric values with the SpGEMM state cached in mmdata from the symbolic phase */
  stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
  ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
  c->reallocs         = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}
2299 
2300 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2301 {
2302   Mat_Product                  *product = C->product;
2303   Mat                          A,B;
2304   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2305   Mat_SeqAIJ                   *a,*b,*c;
2306   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2307   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2308   PetscInt                     i,j,m,n,k;
2309   PetscBool                    flg;
2310   PetscErrorCode               ierr;
2311   cusparseStatus_t             stat;
2312   cudaError_t                  cerr;
2313   MatProductType               ptype;
2314   MatMatCusparse               *mmdata;
2315   PetscLogDouble               flops;
2316   PetscBool                    biscompressed,ciscompressed;
2317 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2318   int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
2319   size_t                       bufSize2;
2320   cusparseSpMatDescr_t         BmatSpDescr;
2321 #else
2322   int                          cnz;
2323 #endif
2324 
2325   PetscFunctionBegin;
2326   MatCheckProduct(C,1);
2327   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
2328   A    = product->A;
2329   B    = product->B;
2330   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2331   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
2332   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2333   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
2334   a = (Mat_SeqAIJ*)A->data;
2335   b = (Mat_SeqAIJ*)B->data;
2336   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2337   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2338   if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2339   if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2340 
2341   /* product data */
2342   ierr = PetscNew(&mmdata);CHKERRQ(ierr);
2343   C->product->data    = mmdata;
2344   C->product->destroy = MatDestroy_MatMatCusparse;
2345 
2346   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2347   ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
2348   ptype = product->type;
2349   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
2350   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
2351   biscompressed = PETSC_FALSE;
2352   ciscompressed = PETSC_FALSE;
2353   switch (ptype) {
2354   case MATPRODUCT_AB:
2355     m = A->rmap->n;
2356     n = B->cmap->n;
2357     k = A->cmap->n;
2358     Amat = Acusp->mat;
2359     Bmat = Bcusp->mat;
2360     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2361     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2362     break;
2363   case MATPRODUCT_AtB:
2364     m = A->cmap->n;
2365     n = B->cmap->n;
2366     k = A->rmap->n;
2367     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
2368     Amat = Acusp->matTranspose;
2369     Bmat = Bcusp->mat;
2370     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2371     break;
2372   case MATPRODUCT_ABt:
2373     m = A->rmap->n;
2374     n = B->rmap->n;
2375     k = A->cmap->n;
2376     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(B);CHKERRQ(ierr);
2377     Amat = Acusp->mat;
2378     Bmat = Bcusp->matTranspose;
2379     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2380     break;
2381   default:
2382     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2383   }
2384 
2385   /* create cusparse matrix */
2386   ierr  = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
2387   ierr  = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
2388   c     = (Mat_SeqAIJ*)C->data;
2389   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2390   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
2391   Ccsr  = new CsrMatrix;
2392 
2393   c->compressedrow.use = ciscompressed;
2394   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
2395     c->compressedrow.nrows = a->compressedrow.nrows;
2396     ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
2397     ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
2398     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
2399     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2400     Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
2401   } else {
2402     c->compressedrow.nrows  = 0;
2403     c->compressedrow.i      = NULL;
2404     c->compressedrow.rindex = NULL;
2405     Ccusp->workVector       = NULL;
2406     Cmat->cprowIndices      = NULL;
2407   }
2408   Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
2409   Ccusp->mat      = Cmat;
2410   Ccusp->mat->mat = Ccsr;
2411   Ccsr->num_rows    = Ccusp->nrows;
2412   Ccsr->num_cols    = n;
2413   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
2414   stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
2415   stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
2416   stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
2417   cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
2418   cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
2419   cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
2420   cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2421   cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2422   cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2423   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
2424     thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
2425     c->nz = 0;
2426     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2427     Ccsr->values = new THRUSTARRAY(c->nz);
2428     goto finalizesym;
2429   }
2430 
2431   if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2432   if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2433   Acsr = (CsrMatrix*)Amat->mat;
2434   if (!biscompressed) {
2435     Bcsr = (CsrMatrix*)Bmat->mat;
2436 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2437     BmatSpDescr = Bmat->matDescr;
2438 #endif
2439   } else { /* we need to use row offsets for the full matrix */
2440     CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
2441     Bcsr = new CsrMatrix;
2442     Bcsr->num_rows       = B->rmap->n;
2443     Bcsr->num_cols       = cBcsr->num_cols;
2444     Bcsr->num_entries    = cBcsr->num_entries;
2445     Bcsr->column_indices = cBcsr->column_indices;
2446     Bcsr->values         = cBcsr->values;
2447     if (!Bcusp->rowoffsets_gpu) {
2448       Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
2449       Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
2450       ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
2451     }
2452     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2453     mmdata->Bcsr = Bcsr;
2454 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2455     if (Bcsr->num_rows && Bcsr->num_cols) {
2456       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
2457                                Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2458                                Bcsr->values->data().get(),
2459                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2460                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2461     }
2462     BmatSpDescr = mmdata->matSpBDescr;
2463 #endif
2464   }
2465   if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
2466   if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
2467   /* precompute flops count */
2468   if (ptype == MATPRODUCT_AB) {
2469     for (i=0, flops = 0; i<A->rmap->n; i++) {
2470       const PetscInt st = a->i[i];
2471       const PetscInt en = a->i[i+1];
2472       for (j=st; j<en; j++) {
2473         const PetscInt brow = a->j[j];
2474         flops += 2.*(b->i[brow+1] - b->i[brow]);
2475       }
2476     }
2477   } else if (ptype == MATPRODUCT_AtB) {
2478     for (i=0, flops = 0; i<A->rmap->n; i++) {
2479       const PetscInt anzi = a->i[i+1] - a->i[i];
2480       const PetscInt bnzi = b->i[i+1] - b->i[i];
2481       flops += (2.*anzi)*bnzi;
2482     }
2483   } else { /* TODO */
2484     flops = 0.;
2485   }
2486 
2487   mmdata->flops = flops;
2488   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2489 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2490   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2491   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
2492                            NULL, NULL, NULL,
2493                            CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2494                            CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2495   stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2496   /* ask bufferSize bytes for external memory */
2497   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2498                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2499                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2500                                        mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
2501   cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
2502   /* inspect the matrices A and B to understand the memory requirement for the next step */
2503   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2504                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2505                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2506                                        mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
2507   /* ask bufferSize again bytes for external memory */
2508   stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2509                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2510                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2511                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
2512   /* The CUSPARSE documentation is not clear, nor the API
2513      We need both buffers to perform the operations properly!
2514      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
2515      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
2516      is stored in the descriptor! What a messy API... */
2517   cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
2518   /* compute the intermediate product of A * B */
2519   stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2520                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2521                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2522                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2523   /* get matrix C non-zero entries C_nnz1 */
2524   stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
2525   c->nz = (PetscInt) C_nnz1;
2526   ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr);
2527   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2528   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2529   Ccsr->values = new THRUSTARRAY(c->nz);
2530   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2531   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
2532                                 Ccsr->values->data().get());CHKERRCUSPARSE(stat);
2533   stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2534                              Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2535                              cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2536 #else
2537   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
2538   stat = cusparseXcsrgemmNnz(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2539                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2540                              Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2541                              Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2542                              Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
2543   c->nz = cnz;
2544   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2545   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2546   Ccsr->values = new THRUSTARRAY(c->nz);
2547   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2548 
2549   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2550   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
2551      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
2552      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
2553   stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2554                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2555                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2556                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2557                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2558 #endif
2559   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2560   ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
2561   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2562 finalizesym:
2563   c->singlemalloc = PETSC_FALSE;
2564   c->free_a       = PETSC_TRUE;
2565   c->free_ij      = PETSC_TRUE;
2566   ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
2567   ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
2568   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
2569     PetscInt *d_i = c->i;
2570     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
2571     THRUSTINTARRAY jj(Ccsr->column_indices->size());
2572     ii   = *Ccsr->row_offsets;
2573     jj   = *Ccsr->column_indices;
2574     if (ciscompressed) d_i = c->compressedrow.i;
2575     cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2576     cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2577   } else {
2578     PetscInt *d_i = c->i;
2579     if (ciscompressed) d_i = c->compressedrow.i;
2580     cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2581     cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2582   }
2583   if (ciscompressed) { /* need to expand host row offsets */
2584     PetscInt r = 0;
2585     c->i[0] = 0;
2586     for (k = 0; k < c->compressedrow.nrows; k++) {
2587       const PetscInt next = c->compressedrow.rindex[k];
2588       const PetscInt old = c->compressedrow.i[k];
2589       for (; r < next; r++) c->i[r+1] = old;
2590     }
2591     for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
2592   }
2593   ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
2594   ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
2595   ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
2596   c->maxnz = c->nz;
2597   c->nonzerorowcnt = 0;
2598   c->rmax = 0;
2599   for (k = 0; k < m; k++) {
2600     const PetscInt nn = c->i[k+1] - c->i[k];
2601     c->ilen[k] = c->imax[k] = nn;
2602     c->nonzerorowcnt += (PetscInt)!!nn;
2603     c->rmax = PetscMax(c->rmax,nn);
2604   }
2605   ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
2606   ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
2607   Ccsr->num_entries = c->nz;
2608 
2609   C->nonzerostate++;
2610   ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
2611   ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
2612   Ccusp->nonzerostate = C->nonzerostate;
2613   C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
2614   C->preallocated  = PETSC_TRUE;
2615   C->assembled     = PETSC_FALSE;
2616   C->was_assembled = PETSC_FALSE;
2617   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
2618     mmdata->reusesym = PETSC_TRUE;
2619     C->offloadmask   = PETSC_OFFLOAD_GPU;
2620   }
2621   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2622   PetscFunctionReturn(0);
2623 }
2624 
PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

/* Selects the symbolic-product implementation for a SeqAIJCUSPARSE A; B may be sparse (CUSPARSE) or dense */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
  PetscErrorCode ierr;
  PetscBool      Bdense = PETSC_FALSE,Bcusp = PETSC_FALSE,Ccusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  /* the GPU sparse-sparse path requires every operand to be usable on the device */
  ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&Bdense);CHKERRQ(ierr);
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Bcusp);CHKERRQ(ierr);
  }
  if (product->type == MATPRODUCT_ABC) {
    Ccusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ccusp);CHKERRQ(ierr);
    }
  }
  if (Bdense) {
    const PetscBool twomat = (PetscBool)(product->type == MATPRODUCT_AB   || product->type == MATPRODUCT_AtB ||
                                         product->type == MATPRODUCT_ABt  || product->type == MATPRODUCT_PtAP ||
                                         product->type == MATPRODUCT_RARt);
    if (twomat) {
      if (product->A->boundtocpu) { /* A lives on the host: fall back to the CPU sparse-dense kernel */
        ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
    } else if (product->type == MATPRODUCT_ABC) {
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
    }
    /* other product types: leave productsymbolic unset, as before */
  } else if (Bcusp && Ccusp) {
    if (product->type == MATPRODUCT_AB || product->type == MATPRODUCT_AtB || product->type == MATPRODUCT_ABt) {
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
    } else if (product->type == MATPRODUCT_PtAP || product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_ABC) {
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
    }
    /* other product types: leave productsymbolic unset, as before */
  } else { /* some operand is plain AIJ or bound to the CPU: use the AIJ fallback */
    ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2685 
/* yy = A xx; forwards to the shared mult-add kernel with no addend and op(A) = A */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  err = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(err);
  PetscFunctionReturn(0);
}
2694 
/* zz = A xx + yy; forwards to the shared mult-add kernel with op(A) = A */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  err = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(err);
  PetscFunctionReturn(0);
}
2703 
/* yy = A^H xx; forwards to the shared mult-add kernel with trans = herm = true and no addend */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  err = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(err);
  PetscFunctionReturn(0);
}
2712 
/* zz = A^H xx + yy; forwards to the shared mult-add kernel with trans = herm = true */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  err = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(err);
  PetscFunctionReturn(0);
}
2721 
/* yy = A^T xx; forwards to the shared mult-add kernel with trans = true, herm = false, no addend */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  err = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(err);
  PetscFunctionReturn(0);
}
2730 
/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op

   Shared kernel behind the MatMult/MatMultAdd/MatMultTranspose/... wrappers.
   yy may be NULL (no addend) and may alias zz (in-place accumulation).
   When A stores compressed rows (all-zero rows dropped), the short operand or
   result lives in cusparsestruct->workVector and is scattered to/from the
   full-length vectors using matstruct->cprowIndices. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny; /* lengths of x and y in y = op(A) x, taken from the CSR struct below */
#endif

  PetscFunctionBegin;
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
  if (!a->nonzerorowcnt) {
    /* A has no nonzero rows, so op(A) x == 0: z is either 0 or a plain copy of y */
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !cusparsestruct->transgen) {
      /* let cusparse apply the (conjugate) transpose of the untransposed matrix */
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      /* use an explicitly stored transpose, generated on demand */
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
     #endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector: gather the entries of x at the nonzero rows of A */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* for the transposed op, the roles of num_rows/num_cols swap */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
     #endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* opA indexes the per-operation cache cuSpMV[]; guard against enum layout changes */
      if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand: dense-vector descriptors and the SpMV work buffer */
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                matstruct->matDescr,
                                matstruct->cuSpMV[opA].vecXDescr, beta,
                                matstruct->cuSpMV[opA].vecYDescr,
                                cusparse_scalartype,
                                cusparsestruct->spmvAlg,
                                &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
      }

      stat = cusparseSpMV(cusparsestruct->handle, opA,
                               matstruct->alpha_one,
                               matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */
                               matstruct->cuSpMV[opA].vecXDescr,
                               beta,
                               matstruct->cuSpMV[opA].vecYDescr,
                               cusparse_scalartype,
                               cusparsestruct->spmvAlg,
                               matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
     #else
      /* pre-11.0 legacy API: direct CSR SpMV, no descriptors or buffers needed */
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha_one, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
     #endif
    } else {
      if (cusparsestruct->nrows) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha_one, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
       #endif
      }
    }
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      /* transpose case: op(A) x is already full length; fold in y unless it aliased z (beta handled it) */
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    /* with no addend, one flop per nonzero row is saved (no initial add into y) */
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2914 
/* zz = A^T xx + yy; forwards to the shared mult-add kernel with trans = true, herm = false */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  err = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(err);
  PetscFunctionReturn(0);
}
2923 
/* Finish (or flush) assembly; if a device-resident split-CSR structure exists, mark the GPU copy as current */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode              ierr;
  PetscSplitCSRDataStructure  *devMat = NULL;

  PetscFunctionBegin;
  /* only unfactored matrices carry a device-side split-CSR structure; read it before the host assembly runs */
  if (A->factortype == MAT_FACTOR_NONE) devMat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); /* this does very little if assembled on GPU - call it? */
  if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0);
  if (devMat) A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
2940 
2941 /* --------------------------------------------------------------------------------*/
2942 /*@
2943    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
2944    (the default parallel PETSc format). This matrix will ultimately be pushed down
2945    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
2946    assembly performance the user should preallocate the matrix storage by setting
2947    the parameter nz (or the array nnz).  By setting these parameters accurately,
2948    performance during matrix assembly can be increased by more than a factor of 50.
2949 
2950    Collective
2951 
2952    Input Parameters:
2953 +  comm - MPI communicator, set to PETSC_COMM_SELF
2954 .  m - number of rows
2955 .  n - number of columns
2956 .  nz - number of nonzeros per row (same for all rows)
2957 -  nnz - array containing the number of nonzeros in the various rows
2958          (possibly different for each row) or NULL
2959 
2960    Output Parameter:
2961 .  A - the matrix
2962 
2963    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
2964    MatXXXXSetPreallocation() paradigm instead of this routine directly.
2965    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
2966 
2967    Notes:
2968    If nnz is given then nz is ignored
2969 
2970    The AIJ format (also called the Yale sparse matrix format or
2971    compressed row storage), is fully compatible with standard Fortran 77
2972    storage.  That is, the stored row and column indices can begin at
2973    either one (as in Fortran) or zero.  See the users' manual for details.
2974 
2975    Specify the preallocated storage with either nz or nnz (not both).
2976    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
2977    allocation.  For large problems you MUST preallocate memory or you
2978    will get TERRIBLE performance, see the users' manual chapter on matrices.
2979 
2980    By default, this format uses inodes (identical nodes) when possible, to
2981    improve numerical efficiency of matrix-vector products and solves. We
2982    search for consecutive rows with the same nonzero structure, thereby
2983    reusing matrix information to achieve increased efficiency.
2984 
2985    Level: intermediate
2986 
2987 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
2988 @*/
/* Creates an m x n sequential AIJCUSPARSE matrix on comm, preallocated with nz per row
   or the per-row counts in nnz (see the manual page above for the full contract). */
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); /* sequential matrix: local sizes equal global sizes */
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* cast drops const; assumed the preallocation routine does not modify nnz -- TODO confirm */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3000 
/* Destroys a SEQAIJCUSPARSE matrix: frees the GPU-side structures (mult structures for an
   unfactored matrix, triangular-factor structures otherwise), the optional device-resident
   split-CSR descriptor, detaches the composed methods, and finally delegates to
   MatDestroy_SeqAIJ() for the host-side AIJ data. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode              ierr;
  PetscSplitCSRDataStructure  *d_mat = NULL; /* device pointer to the split-CSR descriptor, if any */

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    /* save the device-matrix pointer before the spptr that holds it is destroyed */
    d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
    ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL;
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  if (d_mat) {
    Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;
    cudaError_t                err;
    PetscSplitCSRDataStructure h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* copy the descriptor back to the host so the device arrays it references can be freed */
    err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    if (a->compressedrow.use) {
      /* NOTE(review): only diag.i is freed here; presumably the other device arrays are owned
         by the structures destroyed above -- confirm where the device matrix is built */
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  /* detach every method composed at convert/bind time */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3036 
3037 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
3038 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicates a SEQAIJCUSPARSE matrix by duplicating the host AIJ data and then
   converting the copy in place back to the CUSPARSE type. */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3048 
/* Y = Y + a*X, computed on the GPU when both matrices use the CUSPARSE implementation.
   Three paths:
     - SUBSET_NONZERO_PATTERN: general sparse add via cusparse csrgeam
     - SAME_NONZERO_PATTERN:   a single cublas axpy over the value arrays
     - otherwise:              fall back to the host kernel MatAXPY_SeqAIJ */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry,*csrx;
  cudaError_t        cerr;

  PetscFunctionBegin;
  if (X->ops->axpy != Y->ops->axpy) {
    /* mixed implementations (e.g. one of the two matrices bound to the CPU): use the host kernel */
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
  cy   = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  cx   = (Mat_SeqAIJCUSPARSE*)X->spptr;
  if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
  if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;
  /* see if we can turn this into a cublas axpy: if both CSR structures match exactly we may
     upgrade str to SAME_NONZERO_PATTERN and add the value arrays directly */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
    if (eq) {
      eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
    }
    if (eq) str = SAME_NONZERO_PATTERN;
  }

  if (str == SUBSET_NONZERO_PATTERN) {
    cusparseStatus_t stat;
    PetscScalar      b = 1.0; /* coefficient of Y in the geam: Y = a*X + 1*Y */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t           bufferSize;
    void             *buffer;
#endif

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    /* scalars a and b live on the host here, so switch the pointer mode temporarily */
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* CUDA >= 11 geam requires an explicit workspace buffer sized by a query call */
    stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
                                          &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                          &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                             cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    cerr = cudaFree(buffer);CHKERRCUDA(cerr);
#else
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
#endif
    /* restore the pointer mode expected by the rest of this file */
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else if (str == SAME_NONZERO_PATTERN) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t berr;
    PetscBLASInt   one = 1, bnz = 1;

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* identical sparsity: the update is a dense axpy over the nz stored values */
    berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else {
    ierr = MatAXPY_SeqAIJ(Y,a,X,DIFFERENT_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3145 
/* Zero every stored value of the matrix without touching its nonzero pattern. Zeros the
   device-side CSR (and its transpose, when present) as well as the host-side array, then
   updates the offload mask to reflect which copies are now valid. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;
  PetscBool      zeroedOnDevice = PETSC_FALSE;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
    if (spptr->mat) {
      CsrMatrix *csr = (CsrMatrix*)spptr->mat->mat;
      if (csr->values) {
        zeroedOnDevice = PETSC_TRUE; /* GPU copy is valid too */
        thrust::fill(thrust::device,csr->values->begin(),csr->values->end(),0.);
      }
    }
    if (spptr->matTranspose) {
      CsrMatrix *csrT = (CsrMatrix*)spptr->matTranspose->mat;
      if (csrT->values) thrust::fill(thrust::device,csrT->values->begin(),csrT->values->end(),0.);
    }
  }
  /* zero the host values directly (shortcut for MatZeroEntries_SeqAIJ) */
  ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  A->offloadmask = zeroedOnDevice ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;

  PetscFunctionReturn(0);
}
3177 
/* Switch the matrix between the CPU implementation (flg = PETSC_TRUE) and the GPU
   implementation (flg = PETSC_FALSE) by swapping the ops table entries and the composed
   methods. Factored matrices are left untouched. When binding to the CPU the data is
   first synchronized back from the GPU. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  if (flg) {
    /* make sure the host copy is current before routing all operations to the CPU */
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* remove the CUSPARSE-specific composed methods (NULL detaches them) */
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  } else {
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    /* install the CUSPARSE-specific composed methods */
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  a->inode.use = flg; /* inodes are enabled only while bound to the CPU */
  PetscFunctionReturn(0);
}
3226 
/* Converts a SeqAIJ matrix to SEQAIJCUSPARSE. Handles all MatReuse modes; for
   MAT_INPLACE_MATRIX the input matrix itself is retyped. Allocates the CUSPARSE-specific
   spptr (mult structures or tri-factor structures depending on factortype), installs the
   GPU ops table via MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE), and changes the type name. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  /* MAT_INPLACE_MATRIX falls through: *newmat is A itself */
  B = *newmat;

  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      spptr->format = MAT_CUSPARSE_CSR;
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
      spptr->deviceMat = NULL;
    } else {
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* PETSC_FALSE: start out with the GPU ops table installed */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3274 
/* Type constructor for MATSEQAIJCUSPARSE: builds a SeqAIJ matrix, converts it in place,
   and processes type-specific options. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  /* PetscOptionsObject is declared by the PetscObjectOptionsBegin() macro */
  ierr = PetscObjectOptionsBegin((PetscObject)B);CHKERRQ(ierr);
  ierr = MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3287 
3288 /*MC
3289    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3290 
3291    A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
3292    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
3293    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3294 
3295    Options Database Keys:
3296 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
3297 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3298 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3299 
3300   Level: beginner
3301 
3302 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3303 M*/
3304 
3305 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
3306 
3307 
/* Registers the CUSPARSE solver package as a factorization provider (LU, Cholesky, ILU,
   ICC) for MATSEQAIJCUSPARSE matrices; all four factor types share one factory routine. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3319 
/* Frees a Mat_SeqAIJCUSPARSE structure: the mult structures for A and A^T, the thrust
   work/index vectors, the COO assembly permutations, the cusparse handle, and (CUDA >= 11)
   the csr2csc conversion buffer. *cusparsestruct is freed (PetscFree NULLs it). */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    /* deleting a NULL pointer is a no-op, so no guards are needed here */
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    cudaError_t cerr = cudaFree((*cusparsestruct)->csr2cscBuffer);CHKERRCUDA(cerr); /* cudaFree(NULL) is valid */
   #endif
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3341 
/* Frees the three device arrays of a CsrMatrix and the struct itself; *mat is NULLed.
   Safe to call with *mat == NULL. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *csr = *mat;

  PetscFunctionBegin;
  if (csr) {
    delete csr->values;
    delete csr->column_indices;
    delete csr->row_offsets;
    delete csr;
    *mat = NULL;
  }
  PetscFunctionReturn(0);
}
3354 
/* Frees a triangular-factor structure: matrix descriptor, analysis info, the CSR data,
   the solve and (CUDA >= 11) csr2csc scratch buffers, and the host staging array.
   *trifactor is freed (PetscFree NULLs it). Safe to call with *trifactor == NULL. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    if ((*trifactor)->solveBuffer)   {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
    if ((*trifactor)->AA_h)   {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);} /* pinned host memory */
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
   #endif
    ierr = PetscFree(*trifactor);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3374 
/* Frees a mult structure: the stored matrix (CSR, or HYB/ELL on CUDA < 11), the matrix
   descriptor, the compressed-row index vector, the device-resident scalar constants, and
   (CUDA >= 11) the SpMV descriptors and buffers. *matstruct is set to NULL. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode   ierr;
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
       #endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        /* check the return code (was previously ignored), matching the convention used
           everywhere else in this file */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    /* device-resident copies of the scalars 1 and 0 used with CUSPARSE_POINTER_MODE_DEVICE */
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    /* one cached SpMV setup per operation variant (see cuSpMV's usage elsewhere in the file) */
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
3418 
/* Resets (but does not free) a tri-factors container: destroys the four triangular
   factors, the row/column permutation index vectors, and the work vector, leaving the
   container itself and its cusparse handle intact for reuse. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    /* delete on NULL is a no-op; reset the pointers so the container can be reused */
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = NULL;
    (*trifactors)->cpermIndices = NULL;
    (*trifactors)->workVector = NULL;
  }
  PetscFunctionReturn(0);
}
3438 
/* Destroys a tri-factors container: resets its contents, destroys the cusparse handle,
   and frees the container itself (PetscFree NULLs *trifactors). */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    /* split the assignment out of the condition (the old `if (handle = ...)` form is a
       classic source of = vs == confusion and compiler warnings) */
    handle = (*trifactors)->handle;
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
      (*trifactors)->handle = NULL;
    }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3455 
/* Strict weak ordering on (row,col) index pairs: lexicographic, row first */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    const PetscInt r1 = t1.get<0>(), r2 = t2.get<0>();
    return r1 < r2 || (r1 == r2 && t1.get<1>() < t2.get<1>());
  }
};
3466 
/* Equality of (row,col) index pairs */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>();
  }
};
3476 
/* 0 when the two indices match, 1 otherwise (used to flag index changes) */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != t2) ? 1 : 0;
  }
};
3485 
/* Logical OR of two integers, yielding 0 or 1 */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != 0 || t2 != 0) ? 1 : 0;
  }
};
3494 
3495 #include <thrust/iterator/discard_iterator.h>
/* Sets the matrix values from an array v ordered as the (i,j) pairs previously given to
   MatSetPreallocationCOO. Uses the permutation (cooPerm) and, when the COO input had
   repeated entries, the segment keys (cooPerm_a) built at preallocation time to scatter
   (INSERT_VALUES) or accumulate (ADD_VALUES) v into the device CSR values. v may be a
   host or device pointer; the result lives on the GPU. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY                           *cooPerm_v = NULL; /* device copy of v when v is a host pointer */
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                             *matrix;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;
  PetscInt                              n;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    /* no COO preallocation was done: fall back to a plain final assembly */
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) {
    /* NULL v with INSERT_VALUES zeros the matrix; with ADD_VALUES it is a no-op */
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else {
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v,v+n);
    d_v = cooPerm_v->data();
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
  }
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) {
      /* repeated COO entries: first reduce duplicates into a scratch array, then add it in */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* unique entries: add the permuted v directly onto the CSR values */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals());
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else {
      /* unique entries: copy the permuted v over the CSR values */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    }
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU; /* only the device copy was updated */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr);
  ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr);
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
3574 
3575 #include <thrust/binary_search.h>
/* Preallocate the AIJ structure of A from n COO triplet locations (coo_i[],coo_j[]).
   The (i,j) pairs are sorted on the GPU and duplicates are compressed; the permutation
   that maps user-provided COO order to the sorted/compressed storage is kept in
   cusp->cooPerm (and, when duplicates exist, cusp->cooPerm_a holds the destination
   slot of each sorted entry) for later use when values are set in COO order. */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0;
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  /* if the number of COO entries changed, the stored permutation is stale: drop it */
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);            /* device copy of coo_i */
    THRUSTINTARRAY d_j(n);            /* device copy of coo_j */
    THRUSTINTARRAY ii(A->rmap->n);    /* per-row entry counts, becomes row offsets a->i+1 */

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* sort (i,j) pairs lexicographically; cooPerm records, for each sorted slot, the
       original position of the entry in the user-provided COO arrays */
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare());
    *cusp->cooPerm_a = d_i;
    THRUSTINTARRAY w = d_j;

    /* compress duplicate (i,j) pairs in-place */
    auto nekey = thrust::unique(fkey, ekey, IJEqual());
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* I couldn't come up with a more elegant algorithm */
      /* build, for each sorted COO entry, the index of the unique (compressed) entry it
         contributes to: flag where (i,j) changes, then inclusive-scan the flags */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff());
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());
      (*cusp->cooPerm_a)[0] = 0;
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum());
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>());
    }
    /* count entries per row of the compressed set: ii[r] = number of entries with i <= r */
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(),
                        search_begin, search_begin + A->rmap->n,
                        ii.begin());
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* rebuild the host CSR arrays (a->i, a->j, a->a) from the device data */
    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0;
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);       /* count nonempty rows */
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  /* any previously generated transpose no longer matches the new nonzero pattern */
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);

  A->assembled = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3675 
/* Return a read-only device pointer to the CSR values of A, updating the GPU copy
   from the host first if it is stale. Pair with MatSeqAIJCUSPARSERestoreArrayRead().
   Only the CSR storage format exposes a raw values array. */
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); /* ensure device values are current */
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  PetscFunctionReturn(0);
}
3694 
/* Release a device pointer obtained with MatSeqAIJCUSPARSEGetArrayRead().
   Read-only access: the object state is not increased and no data is moved. */
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  *a = NULL; /* invalidate the caller's pointer */
  PetscFunctionReturn(0);
}
3704 
/* Return a read-write device pointer to the CSR values of A, updating the GPU copy
   from the host first if it is stale. Since the caller may modify the device values,
   the offload mask is set to GPU so the (now possibly stale) host copy is not trusted.
   Pair with MatSeqAIJCUSPARSERestoreArray(). */
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); /* ensure device values are current before read-write access */
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
3724 
/* Release a device pointer obtained with MatSeqAIJCUSPARSEGetArray().
   The object state is increased since the values may have been modified. */
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL; /* invalidate the caller's pointer */
  PetscFunctionReturn(0);
}
3737 
/* Return a write-only device pointer to the CSR values of A. Unlike the read and
   read-write variants, no host-to-device copy is performed: the caller is expected
   to overwrite the values, so the offload mask is simply set to GPU.
   Pair with MatSeqAIJCUSPARSERestoreArrayWrite(). */
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
3755 
/* Release a device pointer obtained with MatSeqAIJCUSPARSEGetArrayWrite().
   The object state is increased since the values are assumed to have been written. */
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL; /* invalidate the caller's pointer */
  PetscFunctionReturn(0);
}
3768 
/* Strict-weak ordering for (row, col, value, flag) tuples: compares by row first,
   breaking ties on the column; the value and flag fields do not participate. */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    const int r1 = t1.get<0>();
    const int r2 = t2.get<0>();
    return (r1 != r2) ? (r1 < r2) : (t1.get<1>() < t2.get<1>());
  }
};
3779 
/* Unary functor that adds a fixed integer offset to its argument; used to
   relocate column indices / row offsets when concatenating matrices. */
struct Shift
{
  int delta; /* offset added to every input value */

  Shift(int d) : delta(d) {}
  __host__ __device__
  inline int operator() (const int &c)
  {
    return delta + c;
  }
};
3791 
/* Merges two SeqAIJCUSPARSE matrices side by side: C = [A B] ([A';B']' operation in MATLAB notation).
   A and B must have the same number of rows. With MAT_REUSE_MATRIX, only the numerical values of C
   are refreshed, reusing the merge permutation stored in Ccusp->cooPerm at creation time. */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     Annz,Bnnz;
  cusparseStatus_t             stat;
  PetscInt                     i,m,n,zero = 0;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidHeaderSpecific(B,MAT_CLASSID,2);
  PetscValidPointer(C,4);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
  if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n);
  if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
  if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    /* create C and its cuSPARSE CSR structure from scratch */
    m     = A->rmap->n;
    n     = A->cmap->n + B->cmap->n;
    ierr  = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
    ierr  = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
    ierr  = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    c     = (Mat_SeqAIJ*)(*C)->data;
    Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
    Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr  = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows    = m;
    Ccusp->mat      = Cmat;
    Ccusp->mat->mat = Ccsr;
    Ccsr->num_rows  = m;
    Ccsr->num_cols  = n;
    stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(B);CHKERRQ(ierr);
    if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
    if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr = (CsrMatrix*)Acusp->mat->mat;
    Bcsr = (CsrMatrix*)Bcusp->mat->mat;
    Annz = (PetscInt)Acsr->column_indices->size();
    Bnnz = (PetscInt)Bcsr->column_indices->size();
    c->nz = Annz + Bnnz;
    Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    Ccsr->num_entries = c->nz;
    /* cooPerm records where each entry of A (first Annz slots) and B (remaining Bnnz
       slots) lands inside C's merged value array, for fast reuse later */
    Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      auto Acoo = new THRUSTINTARRAY32(Annz);
      auto Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff,*Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
          ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
          ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      /* expand CSR row offsets of A and B into explicit COO row indices so the
         two matrices can be merged as (row,col,val) streams */
      stat = cusparseXcsr2coo(Acusp->handle,
                              Aroff->data().get(),
                              Annz,
                              m,
                              Acoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle,
                              Broff->data().get(),
                              Bnnz,
                              m,
                              Bcoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
      /* shift B's column indices by A->cmap->n on the fly so they index into C */
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
#endif
      /* merge A's and B's (row,col,val,origin-flag) streams into row-major order;
         wPerm receives 1 for entries coming from A, 0 for entries from B */
      auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
      auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
      auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
      auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
      auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
      auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
      auto p1 = Ccusp->cooPerm->begin();
      auto p2 = Ccusp->cooPerm->begin();
      thrust::advance(p2,Annz);
      PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
      /* undo the in-place shift of B's column indices performed above */
      thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
#endif
      /* split the merged positions by origin flag: cooPerm[0..Annz) = destinations of
         A's entries, cooPerm[Annz..) = destinations of B's entries */
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      auto pred = thrust::identity<int>();
      PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
      PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
#endif
      /* compress C's COO row indices back into CSR row offsets */
      stat = cusparseXcoo2csr(Ccusp->handle,
                              Ccoo->data().get(),
                              c->nz,
                              m,
                              Ccsr->row_offsets->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
                               Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
      if (Acusp->transgen && Bcusp->transgen) { /* if A and B have the transpose, generate C transpose too */
        /* C' = [A' ; B'] stacked vertically: its CSR arrays are the concatenation of
           A's and B's transpose arrays, with B's row offsets shifted by nnz(A) */
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix *CcsrT = new CsrMatrix;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;

        Ccusp->transgen = PETSC_TRUE;
        CmatT->cprowIndices  = NULL;
        CmatT->mat = CcsrT;
        CcsrT->num_rows = n;
        CcsrT->num_cols = m;
        CcsrT->num_entries = c->nz;

        CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values = new THRUSTARRAY(c->nz);

        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
          thrust::advance(rT,-1); /* overlap: last offset of A' equals first offset of B' once shifted */
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
          thrust::copy(titb,tite,rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

        stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
                                 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
                                 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* mirror the CSR structure on the host so C behaves as a regular SeqAIJ matrix too */
    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii   = *Ccsr->row_offsets;
      jj   = *Ccsr->column_indices;
      cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    } else {
      cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
    c->maxnz = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax = 0;
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i+1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax,nn);
    }
    ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
    (*C)->nonzerostate++;
    ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
    ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* MAT_REUSE_MATRIX: only refresh C's values via the stored merge permutation */
    if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",(*C)->rmap->n,B->rmap->n);
    c = (Mat_SeqAIJ*)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
      if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
      if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
      if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
      if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix*)Acusp->mat->mat;
      Bcsr = (CsrMatrix*)Bcusp->mat->mat;
      Ccsr = (CsrMatrix*)Ccusp->mat->mat;
      if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
      if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
      if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
      if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
      if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid,Acsr->num_entries);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      /* scatter A's values into C using cooPerm[0..Annz) */
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      thrust::for_each(zibait,zieait,VecCUDAEquals());
      /* scatter B's values into C using cooPerm[Annz..) */
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
      thrust::for_each(zibbit,ziebit,VecCUDAEquals());
      if (Acusp->transgen && Bcusp->transgen && Ccusp->transgen) {
        /* refresh C' values too: they are simply the concatenation of A' and B' values */
        if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
      }
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    }
  }
  ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
4094 
/* Copy n selected entries of A's CSR values into v: v[k] = a[idx[k]] (or the first n
   values when idx is NULL). v may be host or device memory; the gather itself always
   runs on the GPU, with a device staging buffer when v lives on the host.

   Fix: the final logging call used PetscLogCpuToGpu, but at that point data has been
   copied device->host into the host buffer v, so PetscLogGpuToCpu is the correct
   direction (matching the GpuToCpu logging used elsewhere in this file). */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  PetscErrorCode    ierr;
  bool              dmem;       /* true when v is device memory */
  const PetscScalar *av;
  cudaError_t       cerr;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
  if (n && idx) {
    THRUSTINTARRAY widx(n);     /* device copy of the gather indices */
    widx.assign(idx,idx+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

    THRUSTARRAY *w = NULL;      /* device staging buffer, only when v is on the host */
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* gather on the device: dv[k] = av[widx[k]] */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (w) { /* v is on the host: move the staged result down */
      cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    delete w;
  } else { /* no index set: contiguous copy of the first n values */
    cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  if (!dmem) { ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr); } /* device->host transfer just happened */
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4134