xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision c215019ae02e8adaf4e6a01b1f07a8650fb5f99a)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_CXX_COMPLEX_FIX
7 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
8 
9 #include <petscconf.h>
10 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
11 #include <../src/mat/impls/sbaij/seq/sbaij.h>
12 #include <../src/vec/vec/impls/dvecimpl.h>
13 #include <petsc/private/vecimpl.h>
14 #undef VecType
15 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
16 
/* String tables consumed by PetscOptionsEnum(); each list ends with the enum-type
   name, the option prefix, and a terminating 0, per PETSc convention. */
const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
    0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
  } cusparseCsr2CscAlg_t;
  */
  const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
  const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
  const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif
53 
/* Forward declarations: factorization (symbolic/numeric) entry points */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

/* Triangular solves (with and without reordering) and matrix-vector products */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

/* Destruction/reset helpers for the GPU-side data structures */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

/* Host<->device data movement */
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);

PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
90 
/* Attach a user-provided CUDA stream to this matrix' cuSPARSE handle.
   Subsequent cuSPARSE calls made through the matrix are issued on that stream.
   Errors with PETSC_ERR_COR if the matrix has no cuSPARSE data (spptr). */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusp->stream = stream;
  stat = cusparseSetStream(cusp->handle,cusp->stream);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}
102 
/* Replace the cuSPARSE handle used by this matrix. If the matrix already owns a
   different handle, that one is destroyed first. The (possibly new) handle is
   always switched to device pointer mode before returning. */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusp->handle != handle) {
    /* release the old handle before adopting the caller's */
    if (cusp->handle) {
      stat = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(stat);
    }
    cusp->handle = handle;
  }
  stat = cusparseSetPointerMode(cusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}
119 
/* Forget (without destroying) the cuSPARSE handle stored on A.
   Silently does nothing unless A is a MATSEQAIJCUSPARSE with allocated spptr. */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;
  PetscBool          isseqaijcusp;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&isseqaijcusp);CHKERRQ(ierr);
  if (!isseqaijcusp || !cusp) PetscFunctionReturn(0);
  /* drop the reference only; ownership is assumed to lie elsewhere */
  if (cusp->handle) cusp->handle = 0;
  PetscFunctionReturn(0);
}
132 
/* Query callback composed on factor matrices: reports that the solver package
   behind this factorization is MATSOLVERCUSPARSE. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
139 
140 /*MC
141   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
142   on a single GPU, of type seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
143   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
144   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
145   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
146   algorithms are not recommended. This class does NOT support direct solver operations.
147 
148   Level: beginner
149 
150 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
151 M*/
152 
/* MATSOLVERCUSPARSE factory: create the (square) factor matrix B that will hold
   an LU/ILU or Cholesky/ICC factorization of the SeqAIJCUSPARSE matrix A, and
   install the appropriate symbolic-factorization callbacks. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); /* factors are n x n */
  (*B)->factortype  = ftype;
  (*B)->useordering = PETSC_TRUE;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU:
  case MAT_FACTOR_ILUDT:
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC:
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  }

  /* symbolic phase allocates the actual storage */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
178 
/* Type-specific implementation of MatCUSPARSESetFormat(): record the requested
   GPU storage format. For sequential matrices MULT and ALL set the same field. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT: /* fall through: same storage for both operations */
  case MAT_CUSPARSE_ALL:
    cusp->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
196 
197 /*@
198    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
199    operation. Only the MatMult operation can use different GPU storage formats
200    for MPIAIJCUSPARSE matrices.
201    Not Collective
202 
203    Input Parameters:
204 +  A - Matrix of type SEQAIJCUSPARSE
205 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
206 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
207 
208    Output Parameter:
209 
210    Level: intermediate
211 
212 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
213 @*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation if one is composed on A;
     silently a no-op for matrix types that do not provide it */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
223 
224 /*@
225    MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose
226 
227    Collective on mat
228 
229    Input Parameters:
230 +  A - Matrix of type SEQAIJCUSPARSE
231 -  transgen - the boolean flag
232 
233    Level: intermediate
234 
235 .seealso: MATSEQAIJCUSPARSE, MatAIJCUSPARSESetGenerateTranspose()
236 @*/
PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
{
  PetscErrorCode ierr;
  PetscBool      isseqaijcusp;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  /* matrices of any other type are silently ignored */
  ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&isseqaijcusp);CHKERRQ(ierr);
  if (isseqaijcusp) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

    if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
    cusp->transgen = transgen;
    if (!transgen) {
      /* destroy any cached transpose now, so a stale copy cannot be reused
         if the flag is switched back on later */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}
256 
/* Process runtime options for a MATSEQAIJCUSPARSE matrix: explicit-transpose
   generation, GPU storage format, and (CUDA >= 11 only) the cuSPARSE algorithm
   choices for SpMV, SpMM and CSR-to-CSC conversion. Options are only consulted
   for unfactored matrices. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscBool transgen = cusparsestruct->transgen;

    ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr);}

    /* format for SpMV only */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    /* format for all operations; processed after the MULT-only option, so it wins if both are given */
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    cusparsestruct->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");

    cusparsestruct->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    /* sanity check of the enum's numeric value, as above */
    cusparsestruct->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
   #endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
300 
/* Symbolic ILU for SeqAIJCUSPARSE: clear any GPU factor data left from a
   previous factorization, run the CPU AIJ symbolic phase, and install the
   CUSPARSE numeric-phase callback. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&factors);CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
312 
/* Symbolic LU for SeqAIJCUSPARSE: clear any GPU factor data left from a
   previous factorization, run the CPU AIJ symbolic phase, and install the
   CUSPARSE numeric-phase callback. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&factors);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
324 
/* Symbolic ICC for SeqAIJCUSPARSE: clear any GPU factor data left from a
   previous factorization, run the CPU AIJ symbolic phase, and install the
   CUSPARSE Cholesky numeric-phase callback. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&factors);CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
336 
/* Symbolic Cholesky for SeqAIJCUSPARSE: clear any GPU factor data left from a
   previous factorization, run the CPU AIJ symbolic phase, and install the
   CUSPARSE Cholesky numeric-phase callback. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&factors);CHKERRQ(ierr);
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
348 
/* Build (or refresh) the GPU copy of the unit lower-triangular factor L from the
   host-side AIJ factored data in A->data. On the first call, the full CSR
   structure (row offsets, column indices, values, with an explicit unit
   diagonal appended to each row) is assembled in pinned host buffers, uploaded
   to the device, and the cuSPARSE triangular-solve analysis is performed. On
   later calls only the values are refilled and re-uploaded; the structure and
   analysis are reused. Runs only when the host copy is newer than the device
   copy (offloadmask UNALLOCATED or CPU). */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* ai[n]-ai[1] strictly-lower entries in rows 1..n-1, plus n unit-diagonal entries;
         row 0 of L consists of the unit diagonal only */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {
        PetscScalar                       *AALo;

        /* pinned host memory for fast transfer to the device */
        cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the lower triangular matrix */
        cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the lower triangular matrix */
        AiLo[0]  = (PetscInt) 0;
        AiLo[n]  = nzLower;
        AjLo[0]  = (PetscInt) 0;
        AALo[0]  = (MatScalar) 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        rowOffset= 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          /* copy the row's strictly-lower entries, then append the unit diagonal */
          ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;

        /* assign() on the device arrays performs the host-to-device upload */
        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* query and allocate the workspace buffer required by the csrsv2-style API */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
                               #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                               #endif
);CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        /* keep AALo (as AA_h) for the values-only update path below; the index arrays are no longer needed */
        loTriFactor->AA_h = AALo;
        cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0]  = 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
          offset      += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset      += 1;
          v  += nz;
        }
        /* re-upload only the values; sparsity pattern on the device is unchanged */
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
493 
/* Build (or refresh) the GPU copy of the upper-triangular factor U from the
   host-side AIJ factored data in A->data. Mirrors
   MatSeqAIJCUSPARSEBuildILULowerTriMatrix(): first call assembles the CSR
   structure in pinned host buffers (storing the reciprocal of each diagonal
   entry, since U is stored with inverted diagonal in the AIJ factor — note the
   1./v[nz]), uploads it, and runs the cuSPARSE solve analysis; later calls only
   refill and re-upload the values. Rows are walked backwards via a->diag
   (adiag), which for the factored matrix delimits each row's upper part. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        /* pinned host memory for fast transfer to the device */
        cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix, walking rows from last to first */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;
          vi = aj + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1./v[nz]; /* AIJ factor stores the diagonal inverted */
          AiUp[i]      = AiUp[i+1] - (nz+1);

          ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = n;
        upTriFactor->csrMat->num_cols = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        /* assign() on the device arrays performs the host-to-device upload */
        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* query and allocate the workspace buffer required by the csrsv2-style API */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                     upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                     upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                     upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                     &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
                               #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                               #endif
);CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
        /* keep AAUp (as AA_h) for the values-only update path below; the index arrays are no longer needed */
        upTriFactor->AA_h = AAUp;
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!upTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1./v[nz];
          ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
        }
        /* re-upload only the values; sparsity pattern on the device is unchanged */
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
        ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
637 
/* Ensures the ILU lower/upper triangular factors of A are present on the GPU and,
   when the row/column orderings are not the identity, caches the permutation index
   arrays on the GPU for use by the permuted MatSolve paths. */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  PetscInt                     nrows  = A->rmap->n;
  PetscBool                    rowIdentity,colIdentity;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (!fs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  if (!fs->workVector) fs->workVector = new THRUSTARRAY(nrows);
  fs->nnz = aij->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* cache the row permutation on the GPU unless it is the identity */
  ierr = ISIdentity(aij->row,&rowIdentity);CHKERRQ(ierr);
  if (!rowIdentity && !fs->rpermIndices) {
    const PetscInt *indices;

    ierr = ISGetIndices(aij->row,&indices);CHKERRQ(ierr);
    fs->rpermIndices = new THRUSTINTARRAY(nrows);
    fs->rpermIndices->assign(indices,indices+nrows);
    ierr = ISRestoreIndices(aij->row,&indices);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(nrows*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* cache the column permutation on the GPU unless it is the identity */
  ierr = ISIdentity(aij->icol,&colIdentity);CHKERRQ(ierr);
  if (!colIdentity && !fs->cpermIndices) {
    const PetscInt *indices;

    ierr = ISGetIndices(aij->icol,&indices);CHKERRQ(ierr);
    fs->cpermIndices = new THRUSTINTARRAY(nrows);
    fs->cpermIndices->assign(indices,indices+nrows);
    ierr = ISRestoreIndices(aij->icol,&indices);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(nrows*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
681 
/* Builds (first call) or refreshes (subsequent calls) the GPU copies of the two
   triangular factors used by the ICC/Cholesky solve.  The factored matrix is held
   on the CPU in SBAIJ (upper-triangular) storage; both GPU factors share the same
   CSR sparsity pattern: the upper factor is solved non-transposed while the lower
   factor reuses the upper pattern and is solved with CUSPARSE_OPERATION_TRANSPOSE.
   Diagonal entries are stored inverted so the solve can multiply instead of divide.
   Fix: the two factor-presence checks in the refresh path now report which factor
   is missing instead of repeating the "Missing cusparseTriFactors" message. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data; /* the factor is stored in SBAIJ layout */
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers for the value arrays of both factors */
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (!upTriFactor && !loTriFactor) {
        /* first call: build the CSR structure (row offsets, column indices) too */
        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements (stored inverted for both factors) */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            /* off-diagonals: negated for the upper factor, additionally scaled by the
               inverse diagonal for the lower factor */
            ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                                #endif
);CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        /* fill mode stays UPPER: the lower factor is realized by solving the upper
           pattern with CUSPARSE_OPERATION_TRANSPOSE below */
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                                #endif
);CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      } else {
        /* refresh path: the CSR structure already lives on the GPU, only recompute values */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing upTriFactor");
        if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing loTriFactor");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
897 
/* Ensures the ICC triangular factors of A are present on the GPU; when the
   ordering is not the identity, caches both the permutation and its inverse on
   the GPU so the permuted solve paths can use them. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  PetscInt                     nrows  = A->rmap->n;
  PetscBool                    identity;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (!fs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!fs->workVector) fs->workVector = new THRUSTARRAY(nrows);
  /* both factors store each off-diagonal once and the diagonal once each */
  fs->nnz = (aij->nz-nrows)*2 + nrows;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  ierr = ISIdentity(aij->row,&identity);CHKERRQ(ierr);
  if (!identity) {
    IS             invPerm;
    const PetscInt *inv,*perm;

    ierr = ISInvertPermutation(aij->row,PETSC_DECIDE,&invPerm);CHKERRQ(ierr);
    ierr = ISGetIndices(invPerm,&inv);CHKERRQ(ierr);
    ierr = ISGetIndices(aij->row,&perm);CHKERRQ(ierr);
    fs->rpermIndices = new THRUSTINTARRAY(nrows);
    fs->rpermIndices->assign(perm,perm+nrows);
    fs->cpermIndices = new THRUSTINTARRAY(nrows);
    fs->cpermIndices->assign(inv,inv+nrows);
    ierr = ISRestoreIndices(invPerm,&inv);CHKERRQ(ierr);
    ierr = ISDestroy(&invPerm);CHKERRQ(ierr);
    ierr = ISRestoreIndices(aij->row,&perm);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*nrows*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
935 
/* Numeric LU factorization: pull A down to the CPU, factor there with the SeqAIJ
   kernel, pick the solve variants matching the ordering, then push the triangular
   factors back to the GPU. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* the natural-ordering solvers avoid the permutation step, so use them only when
     both the row and the column orderings are the identity */
  ierr = ISIdentity(b->row,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(b->col,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* upload and analyze the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
966 
/* Numeric Cholesky factorization: pull A down to the CPU, factor there with the
   SeqAIJ kernel, pick the solve variants matching the ordering, then push the
   triangular factors back to the GPU. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* natural-ordering solvers skip the permutation step; use them for identity orderings */
  ierr = ISIdentity(b->row,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* upload and analyze the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
996 
/* Builds explicit transposes (via csr2csc) of the lower and upper triangular
   factors so MatSolveTranspose can use non-transposed cusparse triangular solves,
   and runs the cusparse solve analysis on both.  Results are stored in
   loTriFactorPtrTranspose/upTriFactorPtrTranspose of A->spptr.
   Fix: the PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,...) calls after each
   csr2csc were paired with another Begin instead of End, leaving the event open
   twice and corrupting the performance log; they are now PetscLogEventEnd. */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor;
     transposition flips the fill mode (upper <-> lower) */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                       loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                       loTriFactor->csrMat->values->data().get(),
                                       loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(),
                                       loTriFactorT->csrMat->values->data().get(),
                                       loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer
                        #else
                          loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
                        #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); /* was erroneously PetscLogEventBegin */

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                           loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                          #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor;
     transposition flips the fill mode (upper <-> lower) */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                upTriFactor->csrMat->values->data().get(),
                                upTriFactor->csrMat->row_offsets->data().get(),
                                upTriFactor->csrMat->column_indices->data().get(),
                                upTriFactorT->csrMat->values->data().get(),
                                upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                CUSPARSE_ACTION_NUMERIC,indexBase,
                                CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer
                        #else
                          upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
                        #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); /* was erroneously PetscLogEventBegin */

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
  #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
  #endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                           upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                          #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
1198 
/*
   MatSeqAIJCUSPARSEGenerateTransposeForMult - lazily builds an explicit
   device-side representation of A^T so MatMultTranspose() can be performed
   as a plain (untransposed) SpMV on the transposed matrix.

   Returns immediately without building anything when transpose generation is
   disabled (cusparsestruct->transgen is false), when the transpose has
   already been built, or when the matrix has zero rows or columns.

   CSR format: the transpose is obtained with a CSR->CSC conversion (the CSC
   of A is the CSR of A^T).  ELL/HYB formats (CUDA < 11 only): HYB is
   converted to CSR, transposed, then converted back to HYB.
*/
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* nothing to do: transpose disabled, already built, or empty matrix */
  if (!cusparsestruct->transgen || cusparsestruct->matTranspose || !A->rmap->n || !A->cmap->n) PetscFunctionReturn(0);
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* create cusparse matrix descriptor for the transpose, inheriting the index base of A */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

  /* set alpha and beta: scalar constants live on the device because the
     handle's pointer mode is set to CUSPARSE_POINTER_MODE_DEVICE below */
  err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    /* allocate the CSR storage of A^T: dimensions are swapped, nnz unchanged */
    CsrMatrix *matrixT= new CsrMatrix;
    matrixT->num_rows = A->cmap->n;
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* 32-bit device copy of the full (uncompressed) host row offsets a->i, used as csr2csc input */
    if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
    cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

    /* compute the transpose, i.e. the CSC */
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* CUDA >= 11 requires an explicit workspace buffer for csr2csc; query its
       size and allocate it (kept in cusparsestruct for later value refreshes) */
    stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                  A->cmap->n, matrix->num_entries,
                                  matrix->values->data().get(),
                                  cusparsestruct->rowoffsets_gpu->data().get(),
                                  matrix->column_indices->data().get(),
                                  matrixT->values->data().get(),
                                  matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                  CUSPARSE_ACTION_NUMERIC,indexBase,
                                  cusparsestruct->csr2cscAlg, &cusparsestruct->csr2cscBufferSize);CHKERRCUSPARSE(stat);
    err = cudaMalloc(&cusparsestruct->csr2cscBuffer,cusparsestruct->csr2cscBufferSize);CHKERRCUDA(err);
   #endif

    stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            cusparsestruct->rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                            CUSPARSE_ACTION_NUMERIC,indexBase,
                            cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
                          #else
                            matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase
                          #endif
);CHKERRCUSPARSE(stat);
    matstructT->mat = matrixT;

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* CUDA >= 11 generic SpMV API additionally needs a cusparseSpMatDescr_t */
    stat = cusparseCreateCsr(&matstructT->matDescr,
                             matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                             matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                             matrixT->values->data().get(),
                             CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                             indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
   #endif
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
   #else
    CsrMatrix *temp  = new CsrMatrix;
    CsrMatrix *tempT = new CsrMatrix;
    /* First convert HYB to CSR */
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUSPARSE(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    /* NOTE(review): tempT holds A^T, whose dimensions are (A->cmap->n x A->rmap->n);
       these assignments (and the row_offsets length) look row/col-swapped for
       non-square A -- verify, this legacy CUDA<11 path may only see square A */
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSPARSE(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    /* delete temporaries: the HYB matrix owns its own storage */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
   #endif
  }
  err  = WaitForCUDA();CHKERRCUDA(err);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row (cprow) indices are not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer so subsequent calls return early */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}
1357 
1358 /* Why do we need to analyze the tranposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
   MatSolveTranspose_SeqAIJCUSPARSE - solves A^T x = b on the GPU using the
   transposed triangular factors, for a factorization done with a non-natural
   ordering (so row/column permutations must be applied).

   Device-side data flow:
     b --row-permute--> x --upper^T solve--> work --lower^T solve--> x
       --column-permute--> work --copy--> x

   The transposed factors are built lazily by
   MatSeqAIJCUSPARSEAnalyzeTransposeForSolve() on first use.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transposed factors ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder b with the row permutation, staging the result in x */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve with the transposed upper factor: reads x, writes the work vector */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Then, solve with the transposed lower factor: reads the work vector, writes x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore the vectors */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1444 
/* Solve A^T x = b on the GPU with the transposed triangular factors when the
   factorization used natural ordering, so no row/column permutations are
   needed.  The transposed-upper-factor solve runs first (staging its result
   in the shared work vector), followed by the transposed-lower-factor solve
   that produces x.  The transposed factors are generated lazily on first
   use via MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(). */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *fs   = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work = (THRUSTARRAY*)fs->workVector;
  const PetscScalar                 *b;
  PetscScalar                       *x;
  cusparseStatus_t                  stat;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Build the transposed factors on the fly the first time through */
  if (!loT && !upT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtrTranspose;
    upT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtrTranspose;
  }

  /* Map the vectors onto the device */
  ierr = VecCUDAGetArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&b);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* Transposed upper-factor solve: b -> work */
  stat = cusparse_solve(fs->handle, upT->solveOp,
                        upT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upT->descr,
                        upT->csrMat->values->data().get(),
                        upT->csrMat->row_offsets->data().get(),
                        upT->csrMat->column_indices->data().get(),
                        upT->solveInfo,
                        b, work->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upT->solvePolicy, upT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Transposed lower-factor solve: work -> x */
  stat = cusparse_solve(fs->handle, loT->solveOp,
                        loT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loT->descr,
                        loT->csrMat->values->data().get(),
                        loT->csrMat->row_offsets->data().get(),
                        loT->csrMat->column_indices->data().get(),
                        loT->solveInfo,
                        work->data().get(), x
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loT->solvePolicy, loT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Hand the vectors back and log the work */
  ierr = VecCUDARestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&x);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*fs->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1512 
/*
   MatSolve_SeqAIJCUSPARSE - solves A x = b on the GPU using the cached
   lower/upper triangular factors, for a factorization done with a
   non-natural ordering (so row/column permutations must be applied).

   Device-side data flow:
     b --row-permute--> work --lower solve--> x --upper solve--> work
       --column-permute--> x
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder b with the row permutation into the work vector */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve the lower factor: reads the work vector, writes x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Then, solve the upper factor: reads x, writes the work vector */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Last, reorder the work vector with the column permutation into x */
  thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1587 
/* Solve A x = b on the GPU when the factorization used natural ordering,
   so no permutations of b or x are required.  The lower-factor solve stages
   its result in the shared work vector, and the upper-factor solve produces
   x from it. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *fs   = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lo   = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *up   = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtr;
  THRUSTARRAY                       *work = (THRUSTARRAY*)fs->workVector;
  const PetscScalar                 *b;
  PetscScalar                       *x;
  cusparseStatus_t                  stat;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Map the vectors onto the device */
  ierr = VecCUDAGetArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&b);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* Lower-factor solve: b -> work */
  stat = cusparse_solve(fs->handle, lo->solveOp,
                        lo->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        lo->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, lo->descr,
                        lo->csrMat->values->data().get(),
                        lo->csrMat->row_offsets->data().get(),
                        lo->csrMat->column_indices->data().get(),
                        lo->solveInfo,
                        b, work->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,lo->solvePolicy, lo->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Upper-factor solve: work -> x */
  stat = cusparse_solve(fs->handle, up->solveOp,
                        up->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        up->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, up->descr,
                        up->csrMat->values->data().get(),
                        up->csrMat->row_offsets->data().get(),
                        up->csrMat->column_indices->data().get(),
                        up->solveInfo,
                        work->data().get(), x
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,up->solvePolicy, up->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Hand the vectors back and log the work */
  ierr = VecCUDARestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&x);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*fs->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1647 
/* Copy the nonzero values of A from the GPU back into the host CSR array.

   Acts only when the up-to-date copy lives exclusively on the GPU
   (PETSC_OFFLOAD_GPU); afterwards both copies agree, so the offload mask
   becomes PETSC_OFFLOAD_BOTH.  Only the values move -- the host sparsity
   pattern (row offsets / column indices) is left untouched. */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *aij  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;
  cudaError_t        cerr;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    CsrMatrix *csr = (CsrMatrix*)cusp->mat->mat;

    ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    cerr = cudaMemcpy(aij->a, csr->values->data().get(), aij->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuToCpu(aij->nz*sizeof(PetscScalar));CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    A->offloadmask = PETSC_OFFLOAD_BOTH;  /* host and device now in sync */
  }
  PetscFunctionReturn(0);
}
1668 
/* MatSeqAIJGetArray implementation for SEQAIJCUSPARSE: return the host CSR
   value array, first syncing it from the GPU if the device copy is newer.
   Because the caller may write through the returned pointer, the device
   copy must be considered stale afterwards, hence PETSC_OFFLOAD_CPU. */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;  /* caller may modify the values */
  *array = aij->a;
  PetscFunctionReturn(0);
}
1680 
1681 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1682 {
1683   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1684   Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
1685   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1686   PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
1687   PetscErrorCode               ierr;
1688   cusparseStatus_t             stat;
1689   PetscBool                    both = PETSC_TRUE;
1690   cudaError_t                  err;
1691 
1692   PetscFunctionBegin;
1693   if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Cannot copy to GPU");
1694   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
1695     if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
1696       /* Copy values only */
1697       CsrMatrix *matrix,*matrixT;
1698       matrix = (CsrMatrix*)cusparsestruct->mat->mat;
1699 
1700       if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR values");
1701       ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1702       matrix->values->assign(a->a, a->a+a->nz);
1703       err  = WaitForCUDA();CHKERRCUDA(err);
1704       ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
1705       ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1706 
1707       /* Update matT when it was built before */
1708       if (cusparsestruct->matTranspose) {
1709         cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
1710         matrixT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
1711         ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1712         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1713                             A->cmap->n, matrix->num_entries,
1714                             matrix->values->data().get(),
1715                             cusparsestruct->rowoffsets_gpu->data().get(),
1716                             matrix->column_indices->data().get(),
1717                             matrixT->values->data().get(),
1718                           #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1719                             matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1720                             CUSPARSE_ACTION_NUMERIC,indexBase,
1721                             cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
1722                           #else
1723                             matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1724                             CUSPARSE_ACTION_NUMERIC, indexBase
1725                           #endif
1726 );CHKERRCUSPARSE(stat);
1727         err  = WaitForCUDA();CHKERRCUDA(err);
1728         ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1729       }
1730     } else {
1731       PetscInt nnz;
1732       ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1733       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
1734       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
1735       delete cusparsestruct->workVector;
1736       delete cusparsestruct->rowoffsets_gpu;
1737       try {
1738         if (a->compressedrow.use) {
1739           m    = a->compressedrow.nrows;
1740           ii   = a->compressedrow.i;
1741           ridx = a->compressedrow.rindex;
1742         } else {
1743           m    = A->rmap->n;
1744           ii   = a->i;
1745           ridx = NULL;
1746         }
1747         if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR row data");
1748         if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR column data");
1749         if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
1750         else nnz = a->nz;
1751 
1752         /* create cusparse matrix */
1753         cusparsestruct->nrows = m;
1754         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1755         stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
1756         stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
1757         stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
1758 
1759         err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
1760         err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1761         err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1762         err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1763         err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1764         err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1765         stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
1766 
1767         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1768         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1769           /* set the matrix */
1770           CsrMatrix *mat= new CsrMatrix;
1771           mat->num_rows = m;
1772           mat->num_cols = A->cmap->n;
1773           mat->num_entries = nnz;
1774           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1775           mat->row_offsets->assign(ii, ii + m+1);
1776 
1777           mat->column_indices = new THRUSTINTARRAY32(nnz);
1778           mat->column_indices->assign(a->j, a->j+nnz);
1779 
1780           mat->values = new THRUSTARRAY(nnz);
1781           if (a->a) mat->values->assign(a->a, a->a+nnz);
1782 
1783           /* assign the pointer */
1784           matstruct->mat = mat;
1785          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1786           if (mat->num_rows) { /* cusparse errors on empty matrices! */
1787             stat = cusparseCreateCsr(&matstruct->matDescr,
1788                                     mat->num_rows, mat->num_cols, mat->num_entries,
1789                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
1790                                     mat->values->data().get(),
1791                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
1792                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
1793           }
1794          #endif
1795         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1796          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1797           SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1798          #else
1799           CsrMatrix *mat= new CsrMatrix;
1800           mat->num_rows = m;
1801           mat->num_cols = A->cmap->n;
1802           mat->num_entries = nnz;
1803           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1804           mat->row_offsets->assign(ii, ii + m+1);
1805 
1806           mat->column_indices = new THRUSTINTARRAY32(nnz);
1807           mat->column_indices->assign(a->j, a->j+nnz);
1808 
1809           mat->values = new THRUSTARRAY(nnz);
1810           if (a->a) mat->values->assign(a->a, a->a+nnz);
1811 
1812           cusparseHybMat_t hybMat;
1813           stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1814           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1815             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1816           stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
1817               matstruct->descr, mat->values->data().get(),
1818               mat->row_offsets->data().get(),
1819               mat->column_indices->data().get(),
1820               hybMat, 0, partition);CHKERRCUSPARSE(stat);
1821           /* assign the pointer */
1822           matstruct->mat = hybMat;
1823 
1824           if (mat) {
1825             if (mat->values) delete (THRUSTARRAY*)mat->values;
1826             if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
1827             if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
1828             delete (CsrMatrix*)mat;
1829           }
1830          #endif
1831         }
1832 
1833         /* assign the compressed row indices */
1834         if (a->compressedrow.use) {
1835           cusparsestruct->workVector = new THRUSTARRAY(m);
1836           matstruct->cprowIndices    = new THRUSTINTARRAY(m);
1837           matstruct->cprowIndices->assign(ridx,ridx+m);
1838           tmp = m;
1839         } else {
1840           cusparsestruct->workVector = NULL;
1841           matstruct->cprowIndices    = NULL;
1842           tmp = 0;
1843         }
1844         ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);
1845 
1846         /* assign the pointer */
1847         cusparsestruct->mat = matstruct;
1848       } catch(char *ex) {
1849         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1850       }
1851       err  = WaitForCUDA();CHKERRCUDA(err);
1852       ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1853       cusparsestruct->nonzerostate = A->nonzerostate;
1854     }
1855     if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
1856   }
1857   PetscFunctionReturn(0);
1858 }
1859 
/* Thrust functor for zip iterators: accumulates tuple element 0 into tuple element 1 (y += x) */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
1869 
/* Thrust functor for zip iterators: copies tuple element 0 into tuple element 1 (y = x) */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    using thrust::get;
    get<1>(t) = get<0>(t);
  }
};
1879 
/* Thrust functor for zip iterators: copies tuple element 1 into tuple element 0 (x = y) */
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    using thrust::get;
    get<0>(t) = get<1>(t);
  }
};
1889 
/* Scratch data attached to C->product for MatMat products involving a SEQAIJCUSPARSE matrix */
struct MatMatCusparse {
  PetscBool             cisdense; /* if true, C was MATSEQDENSE on entry and is converted back after the numeric phase */
  PetscScalar           *Bt; /* device buffer for B^T, used pre-CUDA-11 where csrmm does not support op(B) */
  Mat                   X; /* dense intermediate product for MATPRODUCT_PtAP/MATPRODUCT_RARt */
  PetscBool             reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops; /* flop count logged by the sparse-sparse numeric phase */
  CsrMatrix             *Bcsr; /* CSR data used in place of B's mult struct when B is in compressed row storage */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr; /* generic-API descriptor used instead of Bmat->matDescr when Bcsr is set */
  PetscBool             initialized;   /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr; /* dense descriptor wrapping B's device array */
  cusparseDnMatDescr_t  matCDescr; /* dense descriptor wrapping C's (or X's) device array */
  PetscInt              Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
  size_t                mmBufferSize; /* size of mmBuffer in bytes */
  void                  *mmBuffer; /* work buffer for cusparseSpMM/cusparseSpGEMM_compute */
  void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};
1909 
/* Destructor callback (C->product->destroy) for the MatMatCusparse scratch data:
   releases device buffers, cusparse descriptors, the intermediate matrix X, and
   finally the structure itself. */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse   *mm = (MatMatCusparse*)data;
  PetscErrorCode   perr;
  cudaError_t      cuerr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t cstat;
 #endif

  PetscFunctionBegin;
  /* cudaFree(NULL) and delete of a NULL pointer are both no-ops */
  cuerr = cudaFree(mm->Bt);CHKERRCUDA(cuerr);
  delete mm->Bcsr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* descriptors and work buffers of the cusparse generic (Sp*) API */
  if (mm->matSpBDescr) { cstat = cusparseDestroySpMat(mm->matSpBDescr);CHKERRCUSPARSE(cstat); }
  if (mm->mmBuffer)    { cuerr = cudaFree(mm->mmBuffer);CHKERRCUDA(cuerr); }
  if (mm->mmBuffer2)   { cuerr = cudaFree(mm->mmBuffer2);CHKERRCUDA(cuerr); }
  if (mm->matBDescr)   { cstat = cusparseDestroyDnMat(mm->matBDescr);CHKERRCUSPARSE(cstat); }
  if (mm->matCDescr)   { cstat = cusparseDestroyDnMat(mm->matCDescr);CHKERRCUSPARSE(cstat); }
  if (mm->spgemmDesc)  { cstat = cusparseSpGEMM_destroyDescr(mm->spgemmDesc);CHKERRCUSPARSE(cstat); }
 #endif
  perr = MatDestroy(&mm->X);CHKERRQ(perr);
  perr = PetscFree(data);CHKERRQ(perr);
  PetscFunctionReturn(0);
}
1934 
1935 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1936 
/* Numeric phase of a product C = op(A) op(B) where A is SEQAIJCUSPARSE and B is dense.
   For MATPRODUCT_PtAP/MATPRODUCT_RARt the sparse-dense product is accumulated in the
   dense intermediate mmdata->X and then combined with B via
   MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* select which CSR data to use, op(A), and the m x n size of the sparse-dense product */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!cusp->transgen) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else { /* use the explicitly generated transpose with a non-transpose op instead */
      ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);

  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* for PtAP/RARt write into the intermediate X; otherwise write directly into C */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Blda = blda;
    }
    /* NOTE(review): if this branch is entered because only Clda changed, a pre-existing
       matBDescr keeps the barray pointer from a previous call; presumably barray is
       unchanged in that case — confirm, otherwise a cusparseDnMatSetValues is missing here */

    if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
    /* grow the work buffer only when the required size exceeds what is cached */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
      cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
 #else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* explicitly transpose B into the preallocated device buffer mmdata->Bt */
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k    = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
 #endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  /* combine the intermediate X with B; the PETSC_FALSE/PETSC_TRUE flags presumably select
     whether B enters transposed — confirm against the Private helper's signature */
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  /* undo the on-entry type conversions so the caller sees the original (CPU) types */
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2110 
/* Symbolic phase for products of a SEQAIJCUSPARSE matrix A with a dense matrix B:
   sets the size and type of C, allocates the MatMatCusparse scratch data (including
   the B^T buffer pre-CUDA-11 and the dense intermediate X for PtAP/RARt), and
   installs the numeric-phase callback. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          isdense,match;
  PetscErrorCode     ierr;
  MatMatCusparse     *mm;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&match);CHKERRQ(ierr);
  if (!match) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  /* dimensions of the resulting product */
  if      (product->type == MATPRODUCT_AB)   { m = A->rmap->n; n = B->cmap->n; }
  else if (product->type == MATPRODUCT_AtB)  { m = A->cmap->n; n = B->cmap->n; }
  else if (product->type == MATPRODUCT_ABt)  { m = A->rmap->n; n = B->rmap->n; }
  else if (product->type == MATPRODUCT_PtAP) { m = B->cmap->n; n = B->cmap->n; }
  else if (product->type == MATPRODUCT_RARt) { m = B->rmap->n; n = B->rmap->n; }
  else SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&isdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mm);CHKERRQ(ierr);
  mm->cisdense = isdense;
 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mm->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
 #endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mm->X);CHKERRQ(ierr);
    ierr = MatSetType(mm->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mm->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mm->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mm;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2184 
/* Numeric phase of the sparse-sparse product C = op(A) op(B) with A, B, C all
   SEQAIJCUSPARSE. The symbolic phase has already allocated C's CSR structure;
   here only the values are (re)computed, on the GPU. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t         BmatSpDescr;
#endif

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;
  A = product->A;
  B = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix*)Cmat->mat;
    if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
    goto finalize;
  }
  /* nothing to compute when C has no nonzeros; still run the finalize bookkeeping */
  if (!c->nz) goto finalize;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);

  /* symmetric operands let us replace the transposed products with plain AB */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
  if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix*)Cmat->mat;
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
  if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  /* recompute values and copy them into C's descriptor, reusing the SpGEMM plan
     (spgemmDesc/mmBuffer) built in the symbolic phase */
  stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
  ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
  c->reallocs         = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}
2299 
2300 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2301 {
2302   Mat_Product                  *product = C->product;
2303   Mat                          A,B;
2304   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2305   Mat_SeqAIJ                   *a,*b,*c;
2306   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2307   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2308   PetscInt                     i,j,m,n,k;
2309   PetscBool                    flg;
2310   PetscErrorCode               ierr;
2311   cusparseStatus_t             stat;
2312   cudaError_t                  cerr;
2313   MatProductType               ptype;
2314   MatMatCusparse               *mmdata;
2315   PetscLogDouble               flops;
2316   PetscBool                    biscompressed,ciscompressed;
2317 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2318   int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
2319   size_t                       bufSize2;
2320   cusparseSpMatDescr_t         BmatSpDescr;
2321 #else
2322   int                          cnz;
2323 #endif
2324 
2325   PetscFunctionBegin;
2326   MatCheckProduct(C,1);
2327   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
2328   A    = product->A;
2329   B    = product->B;
2330   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2331   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
2332   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2333   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
2334   a = (Mat_SeqAIJ*)A->data;
2335   b = (Mat_SeqAIJ*)B->data;
2336   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2337   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2338   if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2339   if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2340 
2341   /* product data */
2342   ierr = PetscNew(&mmdata);CHKERRQ(ierr);
2343   C->product->data    = mmdata;
2344   C->product->destroy = MatDestroy_MatMatCusparse;
2345 
2346   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2347   ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
2348   ptype = product->type;
2349   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
2350   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
2351   biscompressed = PETSC_FALSE;
2352   ciscompressed = PETSC_FALSE;
2353   switch (ptype) {
2354   case MATPRODUCT_AB:
2355     m = A->rmap->n;
2356     n = B->cmap->n;
2357     k = A->cmap->n;
2358     Amat = Acusp->mat;
2359     Bmat = Bcusp->mat;
2360     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2361     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2362     break;
2363   case MATPRODUCT_AtB:
2364     m = A->cmap->n;
2365     n = B->cmap->n;
2366     k = A->rmap->n;
2367     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
2368     Amat = Acusp->matTranspose;
2369     Bmat = Bcusp->mat;
2370     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2371     break;
2372   case MATPRODUCT_ABt:
2373     m = A->rmap->n;
2374     n = B->rmap->n;
2375     k = A->cmap->n;
2376     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(B);CHKERRQ(ierr);
2377     Amat = Acusp->mat;
2378     Bmat = Bcusp->matTranspose;
2379     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2380     break;
2381   default:
2382     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2383   }
2384 
2385   /* create cusparse matrix */
2386   ierr  = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
2387   ierr  = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
2388   c     = (Mat_SeqAIJ*)C->data;
2389   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2390   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
2391   Ccsr  = new CsrMatrix;
2392 
2393   c->compressedrow.use = ciscompressed;
2394   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
2395     c->compressedrow.nrows = a->compressedrow.nrows;
2396     ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
2397     ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
2398     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
2399     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2400     Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
2401   } else {
2402     c->compressedrow.nrows  = 0;
2403     c->compressedrow.i      = NULL;
2404     c->compressedrow.rindex = NULL;
2405     Ccusp->workVector       = NULL;
2406     Cmat->cprowIndices      = NULL;
2407   }
2408   Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
2409   Ccusp->mat      = Cmat;
2410   Ccusp->mat->mat = Ccsr;
2411   Ccsr->num_rows    = Ccusp->nrows;
2412   Ccsr->num_cols    = n;
2413   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
2414   stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
2415   stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
2416   stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
2417   cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
2418   cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
2419   cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
2420   cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2421   cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2422   cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2423   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
2424     thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
2425     c->nz = 0;
2426     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2427     Ccsr->values = new THRUSTARRAY(c->nz);
2428     goto finalizesym;
2429   }
2430 
2431   if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2432   if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2433   Acsr = (CsrMatrix*)Amat->mat;
2434   if (!biscompressed) {
2435     Bcsr = (CsrMatrix*)Bmat->mat;
2436 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2437     BmatSpDescr = Bmat->matDescr;
2438 #endif
2439   } else { /* we need to use row offsets for the full matrix */
2440     CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
2441     Bcsr = new CsrMatrix;
2442     Bcsr->num_rows       = B->rmap->n;
2443     Bcsr->num_cols       = cBcsr->num_cols;
2444     Bcsr->num_entries    = cBcsr->num_entries;
2445     Bcsr->column_indices = cBcsr->column_indices;
2446     Bcsr->values         = cBcsr->values;
2447     if (!Bcusp->rowoffsets_gpu) {
2448       Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
2449       Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
2450       ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
2451     }
2452     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2453     mmdata->Bcsr = Bcsr;
2454 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2455     if (Bcsr->num_rows && Bcsr->num_cols) {
2456       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
2457                                Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2458                                Bcsr->values->data().get(),
2459                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2460                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2461     }
2462     BmatSpDescr = mmdata->matSpBDescr;
2463 #endif
2464   }
2465   if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
2466   if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
2467   /* precompute flops count */
2468   if (ptype == MATPRODUCT_AB) {
2469     for (i=0, flops = 0; i<A->rmap->n; i++) {
2470       const PetscInt st = a->i[i];
2471       const PetscInt en = a->i[i+1];
2472       for (j=st; j<en; j++) {
2473         const PetscInt brow = a->j[j];
2474         flops += 2.*(b->i[brow+1] - b->i[brow]);
2475       }
2476     }
2477   } else if (ptype == MATPRODUCT_AtB) {
2478     for (i=0, flops = 0; i<A->rmap->n; i++) {
2479       const PetscInt anzi = a->i[i+1] - a->i[i];
2480       const PetscInt bnzi = b->i[i+1] - b->i[i];
2481       flops += (2.*anzi)*bnzi;
2482     }
2483   } else { /* TODO */
2484     flops = 0.;
2485   }
2486 
2487   mmdata->flops = flops;
2488   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2489 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2490   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2491   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
2492                            NULL, NULL, NULL,
2493                            CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2494                            CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2495   stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2496   /* ask bufferSize bytes for external memory */
2497   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2498                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2499                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2500                                        mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
2501   cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
2502   /* inspect the matrices A and B to understand the memory requirement for the next step */
2503   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2504                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2505                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2506                                        mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
2507   /* ask bufferSize again bytes for external memory */
2508   stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2509                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2510                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2511                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
2512   /* The CUSPARSE documentation is not clear, nor the API
2513      We need both buffers to perform the operations properly!
2514      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
2515      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
2516      is stored in the descriptor! What a messy API... */
2517   cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
2518   /* compute the intermediate product of A * B */
2519   stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2520                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2521                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2522                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2523   /* get matrix C non-zero entries C_nnz1 */
2524   stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
2525   c->nz = (PetscInt) C_nnz1;
2526   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2527   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2528   Ccsr->values = new THRUSTARRAY(c->nz);
2529   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2530   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
2531                                 Ccsr->values->data().get());CHKERRCUSPARSE(stat);
2532   stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2533                              Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2534                              cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2535 #else
2536   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
2537   stat = cusparseXcsrgemmNnz(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2538                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2539                              Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2540                              Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2541                              Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
2542   c->nz = cnz;
2543   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2544   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2545   Ccsr->values = new THRUSTARRAY(c->nz);
2546   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2547 
2548   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2549   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
2550      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
2551      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
2552   stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2553                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2554                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2555                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2556                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2557 #endif
2558   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2559   ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
2560   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2561 finalizesym:
2562   c->singlemalloc = PETSC_FALSE;
2563   c->free_a       = PETSC_TRUE;
2564   c->free_ij      = PETSC_TRUE;
2565   ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
2566   ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
2567   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
2568     PetscInt *d_i = c->i;
2569     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
2570     THRUSTINTARRAY jj(Ccsr->column_indices->size());
2571     ii   = *Ccsr->row_offsets;
2572     jj   = *Ccsr->column_indices;
2573     if (ciscompressed) d_i = c->compressedrow.i;
2574     cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2575     cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2576   } else {
2577     PetscInt *d_i = c->i;
2578     if (ciscompressed) d_i = c->compressedrow.i;
2579     cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2580     cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2581   }
2582   if (ciscompressed) { /* need to expand host row offsets */
2583     PetscInt r = 0;
2584     c->i[0] = 0;
2585     for (k = 0; k < c->compressedrow.nrows; k++) {
2586       const PetscInt next = c->compressedrow.rindex[k];
2587       const PetscInt old = c->compressedrow.i[k];
2588       for (; r < next; r++) c->i[r+1] = old;
2589     }
2590     for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
2591   }
2592   ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
2593   ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
2594   ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
2595   c->maxnz = c->nz;
2596   c->nonzerorowcnt = 0;
2597   c->rmax = 0;
2598   for (k = 0; k < m; k++) {
2599     const PetscInt nn = c->i[k+1] - c->i[k];
2600     c->ilen[k] = c->imax[k] = nn;
2601     c->nonzerorowcnt += (PetscInt)!!nn;
2602     c->rmax = PetscMax(c->rmax,nn);
2603   }
2604   ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
2605   ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
2606   Ccsr->num_entries = c->nz;
2607 
2608   C->nonzerostate++;
2609   ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
2610   ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
2611   Ccusp->nonzerostate = C->nonzerostate;
2612   C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
2613   C->preallocated  = PETSC_TRUE;
2614   C->assembled     = PETSC_FALSE;
2615   C->was_assembled = PETSC_FALSE;
2616   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
2617     mmdata->reusesym = PETSC_TRUE;
2618     C->offloadmask   = PETSC_OFFLOAD_GPU;
2619   }
2620   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2621   PetscFunctionReturn(0);
2622 }
2623 
2624 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2625 
/* handles sparse or dense B: picks the symbolic-product routine for this SEQAIJCUSPARSE matrix,
   falling back to the host (SeqAIJ) implementations when any operand is bound to the CPU */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
  PetscErrorCode ierr;
  PetscBool      Bisdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  /* base-type compare so CUDA dense matrices also count as dense */
  ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&Bisdense);CHKERRQ(ierr);
  /* only consider the GPU sparse-sparse path when neither A nor B is pinned to the CPU */
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr);
  }
  /* for triple products, C must be a GPU matrix (and not CPU-bound) as well */
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr);
    }
  }
  if (Bisdense) {
    const MatProductType t = product->type;

    if (t == MATPRODUCT_AB || t == MATPRODUCT_AtB || t == MATPRODUCT_ABt || t == MATPRODUCT_PtAP || t == MATPRODUCT_RARt) {
      if (product->A->boundtocpu) { /* host fallback for CPU-bound A */
        ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
    } else if (t == MATPRODUCT_ABC) {
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
    }
    /* any other product type: leave the operation unset, as before */
  } else if (Biscusp && Ciscusp) {
    const MatProductType t = product->type;

    if (t == MATPRODUCT_AB || t == MATPRODUCT_AtB || t == MATPRODUCT_ABt) {
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
    } else if (t == MATPRODUCT_PtAP || t == MATPRODUCT_RARt || t == MATPRODUCT_ABC) {
      /* no native GPU triple products: compose from pairwise products */
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
    }
    /* any other product type: leave the operation unset, as before */
  } else { /* fallback for AIJ */
    ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2684 
/* yy = A*xx: plain mat-vec, delegated to the shared kernel (no add, no transpose, no conjugation) */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2693 
/* zz = A*xx + yy: mat-vec with add, delegated to the shared kernel (no transpose, no conjugation) */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2702 
/* yy = A^H*xx: conjugate-transpose mat-vec, delegated to the shared kernel (trans=TRUE, herm=TRUE) */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2711 
/* zz = A^H*xx + yy: conjugate-transpose mat-vec with add (trans=TRUE, herm=TRUE) */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2720 
/* yy = A^T*xx: transpose mat-vec, delegated to the shared kernel (trans=TRUE, herm=FALSE) */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2729 
/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op

   Shared kernel behind all the MatMult*_SeqAIJCUSPARSE wrappers above.
   yy may be NULL (no add); yy == zz means the add is performed in place.
   Handles matrices stored in compressed-row form (zero rows dropped): the product of the
   compressed matrix is shorter than the full vector, so a work vector plus a thrust
   scatter/gather against cprowIndices is used to expand/compress as needed. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;                     /* mult struct for A or its stored transpose */
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr; /* dptr/xptr may point at the work vector */
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;                     /* does matstruct drop zero rows? */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny;                          /* lengths of x and y in y = op(A) x, for DnVec descriptors */
#endif

  PetscFunctionBegin;
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
  if (!a->nonzerorowcnt) {
    /* A has no nonzero rows: op(A) x == 0, so z = y (or zero when there is no y) */
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !cusparsestruct->transgen) {
      /* let cuSPARSE apply op to the untransposed storage (always for ^H, or when an
         explicit transpose is not to be generated) */
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      /* use (building on demand) the explicitly stored transpose, so a plain SpMV runs */
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
     #endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        /* workVector[k] = x[cprowIndices[k]] for each stored (nonzero) row k */
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
     #endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* opA indexes the per-operation cache cuSpMV[]; guard against ABI changes in the enum values */
      if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                matstruct->matDescr,
                                matstruct->cuSpMV[opA].vecXDescr, beta,
                                matstruct->cuSpMV[opA].vecYDescr,
                                cusparse_scalartype,
                                cusparsestruct->spmvAlg,
                                &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
      }

      stat = cusparseSpMV(cusparsestruct->handle, opA,
                               matstruct->alpha_one,
                               matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */
                               matstruct->cuSpMV[opA].vecXDescr,
                               beta,
                               matstruct->cuSpMV[opA].vecYDescr,
                               cusparse_scalartype,
                               cusparsestruct->spmvAlg,
                               matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
     #else
      /* pre-CUDA-11 legacy csrmv interface */
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha_one, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
     #endif
    } else {
      if (cusparsestruct->nrows) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha_one, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
       #endif
      }
    }
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        /* z[cprowIndices[k]] += workVector[k] for each stored (nonzero) row k */
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      /* transpose path: the SpMV already produced a full-length z; only a separate add may remain */
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  /* 2 flops per stored nonzero; without an add, one op per nonzero row is saved */
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2913 
/* zz = A^T*xx + yy: transpose mat-vec with add (trans=TRUE, herm=FALSE) */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2922 
/* Finishes assembly: runs the host SeqAIJ assembly, then, on a final assembly of a
   matrix that has a device-side split-CSR structure, marks the GPU copy as current */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode              ierr;
  PetscSplitCSRDataStructure  *devmat = NULL;

  PetscFunctionBegin;
  /* the device split-CSR structure is only kept for unfactored matrices */
  if (A->factortype == MAT_FACTOR_NONE) devmat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); /* host-side assembly; does very little if assembled on GPU - call it? */
  /* nothing further for flush assemblies or CPU-bound matrices; otherwise, a present
     device structure means the up-to-date values live on the GPU */
  if (mode != MAT_FLUSH_ASSEMBLY && !A->boundtocpu && devmat) A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
2939 
2940 /* --------------------------------------------------------------------------------*/
2941 /*@
2942    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
2944    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
2945    assembly performance the user should preallocate the matrix storage by setting
2946    the parameter nz (or the array nnz).  By setting these parameters accurately,
2947    performance during matrix assembly can be increased by more than a factor of 50.
2948 
2949    Collective
2950 
2951    Input Parameters:
2952 +  comm - MPI communicator, set to PETSC_COMM_SELF
2953 .  m - number of rows
2954 .  n - number of columns
2955 .  nz - number of nonzeros per row (same for all rows)
2956 -  nnz - array containing the number of nonzeros in the various rows
2957          (possibly different for each row) or NULL
2958 
2959    Output Parameter:
2960 .  A - the matrix
2961 
2962    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
2963    MatXXXXSetPreallocation() paradgm instead of this routine directly.
2964    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
2965 
2966    Notes:
2967    If nnz is given then nz is ignored
2968 
2969    The AIJ format (also called the Yale sparse matrix format or
2970    compressed row storage), is fully compatible with standard Fortran 77
2971    storage.  That is, the stored row and column indices can begin at
2972    either one (as in Fortran) or zero.  See the users' manual for details.
2973 
2974    Specify the preallocated storage with either nz or nnz (not both).
2975    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
2976    allocation.  For large problems you MUST preallocate memory or you
2977    will get TERRIBLE performance, see the users' manual chapter on matrices.
2978 
2979    By default, this format uses inodes (identical nodes) when possible, to
2980    improve numerical efficiency of matrix-vector products and solves. We
2981    search for consecutive rows with the same nonzero structure, thereby
2982    reusing matrix information to achieve increased efficiency.
2983 
2984    Level: intermediate
2985 
2986 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
2987 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Standard create/size/type/preallocate sequence; local and global sizes
     coincide because the matrix is sequential */
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* cast away const: the preallocation routine takes a non-const pointer but
     only reads the array */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2999 
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode              ierr;
  PetscSplitCSRDataStructure  *d_mat = NULL;

  PetscFunctionBegin;
  /* Free the GPU-side data: the plain CUSPARSE structure for an unfactored
     matrix, or the triangular-factor structures for a factored one */
  if (A->factortype == MAT_FACTOR_NONE) {
    /* detach the split-CSR device matrix before destroying spptr so it can be
       freed separately below */
    d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
    ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL;
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  if (d_mat) {
    /* A device-resident matrix descriptor exists; copy it back to the host so
       the device allocations it references can be freed */
    Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;
    cudaError_t                err;
    PetscSplitCSRDataStructure h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    if (a->compressedrow.use) {
      /* diag.i was allocated on the device only in the compressed-row case */
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  /* Remove the composed methods installed when the matrix was converted to this type */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  /* finally destroy the host-side AIJ data */
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3035 
3036 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
3037 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Duplicate on the host as a plain SeqAIJ, then convert the copy in place
     back to the CUSPARSE type */
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3047 
/* Y = Y + a*X performed on the GPU where possible */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry,*csrx;
  cudaError_t        cerr;

  PetscFunctionBegin;
  /* If the two matrices do not share this implementation, fall back to the host version */
  if (X->ops->axpy != Y->ops->axpy) {
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
  cy   = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  cx   = (Mat_SeqAIJCUSPARSE*)X->spptr;
  if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
  if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;
  /* see if we can turn this into a cublas axpy: same nz count and identical
     row offsets/column indices on the device imply the same nonzero pattern */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
    if (eq) {
      eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
    }
    if (eq) str = SAME_NONZERO_PATTERN;
  }

  if (str == SUBSET_NONZERO_PATTERN) {
    /* general sparse matrix addition via cusparse csrgeam: Y = a*X + 1.0*Y,
       writing the result back into Y's arrays (pattern of X subset of Y's) */
    cusparseStatus_t stat;
    PetscScalar      b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t           bufferSize;
    void             *buffer;
#endif

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    /* scalars a and b are passed by host pointer below */
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* CUDA >= 11 requires an explicit workspace query + allocation for geam */
    stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
                                          &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                          &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                             cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    cerr = cudaFree(buffer);CHKERRCUDA(cerr);
#else
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
#endif
    /* restore the handle's default pointer mode */
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical patterns: the value arrays line up, so a single cublas axpy suffices */
    cublasHandle_t cublasv2handle;
    cublasStatus_t berr;
    PetscBLASInt   one = 1, bnz = 1;

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else {
    /* unrelated patterns: fall back to the host implementation */
    ierr = MatAXPY_SeqAIJ(Y,a,X,DIFFERENT_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3144 
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;
  PetscBool      zeroedOnDevice = PETSC_FALSE;
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  /* Zero the device copies (matrix and its transpose, when present) so they
     stay consistent with the host values zeroed below */
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
    if (spptr->mat) {
      CsrMatrix *csr = (CsrMatrix*)spptr->mat->mat;
      if (csr->values) {
        zeroedOnDevice = PETSC_TRUE;
        thrust::fill(thrust::device,csr->values->begin(),csr->values->end(),0.);
      }
    }
    if (spptr->matTranspose) {
      CsrMatrix *csrT = (CsrMatrix*)spptr->matTranspose->mat;
      if (csrT->values) {
        thrust::fill(thrust::device,csrT->values->begin(),csrT->values->end(),0.);
      }
    }
  }
  /* Zero the host values directly (shortcut for MatZeroEntries_SeqAIJ) */
  ierr = PetscArrayzero(aij->a,aij->i[A->rmap->n]);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  /* both copies are valid only when the device values were actually zeroed */
  A->offloadmask = zeroedOnDevice ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
3176 
/* Switch the matrix operation table (and composed methods) between the host
   SeqAIJ implementations (flg == PETSC_TRUE) and the CUSPARSE ones */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* factored matrices keep their (solve-oriented) operation table */
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  if (flg) {
    /* binding to CPU: make sure host values are current first */
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* remove the GPU-specific composed methods */
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  } else {
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    /* install the GPU-specific composed methods */
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  /* inodes are a host-side optimization; enable only when bound to CPU */
  a->inode.use = flg;
  PetscFunctionReturn(0);
}
3225 
/* Convert a SeqAIJ matrix to the SeqAIJCUSPARSE type (also used to set up a
   freshly created matrix of this type; see MatCreate_SeqAIJCUSPARSE) */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  /* MAT_INPLACE_MATRIX: *newmat is A itself */
  B = *newmat;

  /* vectors associated with this matrix should live on the GPU */
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    /* allocate the GPU-side data structure appropriate to factored/unfactored state */
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      spptr->format = MAT_CUSPARSE_CSR;
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
      spptr->deviceMat = NULL;
    } else {
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  /* install the type-specific methods */
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* installs the GPU operation table and composed methods */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3273 
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Build a plain SeqAIJ and convert it in place to the CUSPARSE type */
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  /* PetscObjectOptionsBegin declares PetscOptionsObject used by the call below */
  ierr = PetscObjectOptionsBegin((PetscObject)B);CHKERRQ(ierr);
  ierr = MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3286 
3287 /*MC
3288    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3289 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
3291    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
3292    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3293 
3294    Options Database Keys:
3295 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
3296 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3297 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3298 
3299   Level: beginner
3300 
3301 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3302 M*/
3303 
3304 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
3305 
3306 
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode      ierr;
  const MatFactorType factortypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  size_t              i;

  PetscFunctionBegin;
  /* Register the cuSPARSE factorization backend for every factor type it supports */
  for (i=0; i<sizeof(factortypes)/sizeof(factortypes[0]); i++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,factortypes[i],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3318 
/* Free the GPU-side data of an unfactored SeqAIJCUSPARSE matrix and zero the pointer */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    /* destroy the device matrix and its (lazily built) transpose */
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    /* delete of a NULL pointer is a no-op */
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* csr2csc workspace exists only with the CUDA >= 11 API; cudaFree(NULL) is harmless */
    cudaError_t cerr = cudaFree((*cusparsestruct)->csr2cscBuffer);CHKERRCUDA(cerr);
   #endif
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3340 
/* Free the three device arrays of a CSR matrix, then the struct, and zero the pointer */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    CsrMatrix *m = *mat;
    delete m->values;
    delete m->column_indices;
    delete m->row_offsets;
    delete m;
    *mat = NULL;
  }
  PetscFunctionReturn(0);
}
3353 
/* Free one triangular-factor structure (descriptor, analysis info, CSR data,
   and CUDA work buffers) and zero the pointer */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    if ((*trifactor)->solveBuffer)   {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
    /* AA_h is pinned host memory, hence cudaFreeHost */
    if ((*trifactor)->AA_h)   {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
   #endif
    ierr = PetscFree(*trifactor);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3373 
/* Free one mult structure (device matrix in the given storage format, its
   descriptor, scalar constants, and SpMV buffers) and zero the pointer */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
       #endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        /* check the error code instead of silently dropping it, consistent
           with the CHKERRQ convention used throughout this file */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    /* device-resident scalar constants used with CUSPARSE_POINTER_MODE_DEVICE */
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* generic SpMV API objects exist only with CUDA >= 11 */
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
3417 
/* Destroy the contents of a trifactors structure without freeing the structure
   itself or its cusparse handle (so it can be refilled by a new factorization) */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;
    /* all four triangular factors: lower/upper, regular and transpose solves */
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose);CHKERRQ(ierr);
    /* permutation indices and scratch vector; delete of NULL is a no-op */
    delete fs->rpermIndices;
    fs->rpermIndices = NULL;
    delete fs->cpermIndices;
    fs->cpermIndices = NULL;
    delete fs->workVector;
    fs->workVector = NULL;
  }
  PetscFunctionReturn(0);
}
3437 
/* Fully destroy a trifactors structure: reset its contents, destroy the
   cusparse handle, and free the structure */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    /* explicit assignment instead of assignment-in-condition: same behavior,
       avoids the -Wparentheses "suggest parentheses" warning */
    handle = (*trifactors)->handle;
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3454 
/* Strict-weak lexicographic ordering of (row,col) index pairs, used to sort COO entries */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() < t2.get<0>() || (t1.get<0>() == t2.get<0>() && t1.get<1>() < t2.get<1>());
  }
};
3465 
/* Equality of (row,col) index pairs, used to detect duplicate COO entries */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>();
  }
};
3475 
/* 0 when the two values are equal, 1 otherwise (marks transitions in a sorted sequence) */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != t2) ? 1 : 0;
  }
};
3484 
/* Logical OR of two integers, returned as 0 or 1 */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 || t2) ? 1 : 0;
  }
};
3493 
3494 #include <thrust/iterator/discard_iterator.h>
/* Insert/add values into the device CSR matrix from a COO value array v[],
   using the permutation built by MatSetPreallocationCOO_SeqAIJCUSPARSE */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY                           *cooPerm_v = NULL;
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                             *matrix;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;
  PetscInt                              n;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    /* no COO preallocation was done; just finish assembly and return */
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) {
    /* NULL values with INSERT_VALUES zeroes the matrix; with ADD_VALUES it is a no-op */
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    /* values already on the device: wrap the pointer directly */
    d_v = thrust::device_pointer_cast(v);
  } else {
    /* host values: stage them in a temporary device array */
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v,v+n);
    d_v = cooPerm_v->data();
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
  }
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) {
      /* repeated COO entries: reduce duplicates into a scratch array, then add to the matrix */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* no duplicates: add each permuted value to its matrix entry */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals());
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else {
      /* no duplicates: overwrite each matrix entry with its permuted value */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    }
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr);
  ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr);
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
3573 
3574 #include <thrust/binary_search.h>
/* Sets the nonzero pattern of A from n COO triplets (coo_i[],coo_j[]).

   The (i,j) pairs are sorted on the GPU and duplicates are compressed out.
   Two permutations are cached on the Mat_SeqAIJCUSPARSE struct for later use by
   the COO value-setting routine:
     - cusp->cooPerm   maps user COO order to sorted order;
     - cusp->cooPerm_a (non-NULL only when duplicates exist) maps each sorted
       entry to its compressed destination, so repeated entries can be reduced.
   The host CSR index arrays a->i/a->j are rebuilt and the values are zeroed;
   the matrix is left unassembled so MatSetValuesCOO must be called next. */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0; /* nzr counts rows that have at least one entry */
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  /* a different number of COO entries invalidates any cached permutations */
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);         /* device copy of coo_i */
    THRUSTINTARRAY d_j(n);         /* device copy of coo_j */
    THRUSTINTARRAY ii(A->rmap->n); /* cumulative per-row entry counts; becomes a->i+1 */

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* sort the (i,j) pairs lexicographically, carrying the original positions in cooPerm */
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare());
    *cusp->cooPerm_a = d_i; /* keep the sorted row indices before unique() compacts the keys */
    THRUSTINTARRAY w = d_j; /* and likewise the sorted column indices */

    auto nekey = thrust::unique(fkey, ekey, IJEqual());
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* I couldn't come up with a more elegant algorithm */
      /* turn cooPerm_a[k] into the compressed index of sorted entry k: mark where
         either the row or the column changes, then prefix-sum the change marks */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff());
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());
      (*cusp->cooPerm_a)[0] = 0;
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum());
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>());
    }
    /* ii[r] = number of compressed entries with row index <= r (d_i is sorted) */
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(),
                        search_begin, search_begin + A->rmap->n,
                        ii.begin());
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* rebuild the host CSR index arrays from the device results */
    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0;
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);

  /* values are still zero: the matrix must be assembled via the COO value setter */
  A->assembled = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3674 
/* Returns a read-only device pointer to the CSR value array of a
   MATSEQAIJCUSPARSE matrix. The matrix data is pushed to the GPU first if it is
   not current there. Only the CSR storage format is supported; pair with
   MatSeqAIJCUSPARSERestoreArrayRead(). */
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csrmat;
  PetscErrorCode     err;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (!cusparsestruct->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  err = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(err);
  csrmat = (CsrMatrix*)cusparsestruct->mat->mat;
  if (!csrmat->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csrmat->values->data().get();
  PetscFunctionReturn(0);
}
3693 
/* Releases a pointer obtained with MatSeqAIJCUSPARSEGetArrayRead().

   The caller's pointer is zeroed so it cannot be reused after restore. Since
   read-only access cannot have modified the matrix, no object state increase is
   performed here (contrast with MatSeqAIJCUSPARSERestoreArray()). */
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  *a = NULL;
  PetscFunctionReturn(0);
}
3703 
/* Returns a writable device pointer to the CSR value array of a
   MATSEQAIJCUSPARSE matrix. The current data is first copied to the GPU if
   needed, and the offload mask is set to GPU since the caller may modify the
   device values. Only the CSR storage format is supported; pair with
   MatSeqAIJCUSPARSERestoreArray(). */
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csrmat;
  PetscErrorCode     err;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (!cusparsestruct->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  err = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(err);
  csrmat = (CsrMatrix*)cusparsestruct->mat->mat;
  if (!csrmat->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csrmat->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
3723 
/* Releases a pointer obtained with MatSeqAIJCUSPARSEGetArray().

   The caller's pointer is zeroed, and the object state is increased because the
   device values may have been modified through the returned pointer. */
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  err = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(err);
  *a = NULL;
  PetscFunctionReturn(0);
}
3736 
/* Returns a write-only device pointer to the CSR value array of a
   MATSEQAIJCUSPARSE matrix. Unlike MatSeqAIJCUSPARSEGetArray(), the current
   values are NOT copied to the GPU first (the caller is expected to overwrite
   them). The offload mask is set to GPU. Only the CSR storage format is
   supported; pair with MatSeqAIJCUSPARSERestoreArrayWrite(). */
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csrmat;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (!cusparsestruct->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  csrmat = (CsrMatrix*)cusparsestruct->mat->mat;
  if (!csrmat->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csrmat->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
3754 
/* Releases a pointer obtained with MatSeqAIJCUSPARSEGetArrayWrite().

   The caller's pointer is zeroed, and the object state is increased because the
   device values were (presumably) written through the returned pointer. */
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  err = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(err);
  *a = NULL;
  PetscFunctionReturn(0);
}
3767 
/* Strict-weak ordering for (row, col, value, tag) tuples: compares by row
   first, then by column within equal rows; the value and tag fields do not
   participate. Used to merge two COO-entry streams in CSR order. */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &lhs, const thrust::tuple<int, int, PetscScalar, int> &rhs)
  {
    const int lrow = lhs.get<0>(), rrow = rhs.get<0>();
    if (lrow != rrow) return lrow < rrow;
    return lhs.get<1>() < rhs.get<1>();
  }
};
3778 
/* Unary functor that adds a fixed integer offset to its argument; used to
   shift column indices (and row-offset entries) when concatenating two CSR
   matrices. */
struct Shift
{
  int delta; /* offset added to every input value */

  Shift(int shift) : delta(shift) {}
  __host__ __device__
  inline int operator() (const int &c)
  {
    return delta + c;
  }
};
3790 
/* Merges two SeqAIJCUSPARSE matrices side by side: C = [A,B], the [A';B']' operation
   in MATLAB notation. A and B must have the same number of rows and use the CSR
   storage format; MAT_INPLACE_MATRIX is not supported.

   With MAT_INITIAL_MATRIX the merged CSR pattern of C is built on the GPU and a
   permutation (Ccusp->cooPerm) from the concatenated entries of A followed by B to
   C's CSR order is cached. With MAT_REUSE_MATRIX only the numerical values are
   scattered into C through that cached permutation. */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     Annz,Bnnz;
  cusparseStatus_t             stat;
  PetscInt                     i,m,n,zero = 0;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidHeaderSpecific(B,MAT_CLASSID,2);
  PetscValidPointer(C,4);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
  if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n);
  if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
  if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    /* build C from scratch: allocate the CUSPARSE structures and compute the merged pattern */
    m     = A->rmap->n;
    n     = A->cmap->n + B->cmap->n;
    ierr  = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
    ierr  = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
    ierr  = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    c     = (Mat_SeqAIJ*)(*C)->data;
    Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
    Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr  = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows    = m;
    Ccusp->mat      = Cmat;
    Ccusp->mat->mat = Ccsr;
    Ccsr->num_rows  = m;
    Ccsr->num_cols  = n;
    stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(B);CHKERRQ(ierr);
    if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
    if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr = (CsrMatrix*)Acusp->mat->mat;
    Bcsr = (CsrMatrix*)Bcusp->mat->mat;
    Annz = (PetscInt)Acsr->column_indices->size();
    Bnnz = (PetscInt)Bcsr->column_indices->size();
    c->nz = Annz + Bnnz;
    Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    Ccsr->num_entries = c->nz;
    Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      auto Acoo = new THRUSTINTARRAY32(Annz);
      auto Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff,*Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
          ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
          ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      /* expand the CSR row offsets of A and B into explicit COO row indices */
      stat = cusparseXcsr2coo(Acusp->handle,
                              Aroff->data().get(),
                              Annz,
                              m,
                              Acoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle,
                              Broff->data().get(),
                              Bnnz,
                              m,
                              Bcoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      /* tag A's entries with 1 and B's with 0 so that, after the merge, we can tell
         which source each C entry came from.
         Issues when using bool with large matrices on SUMMIT 10.2.89 */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
      /* B's column indices are shifted by A->cmap->n since B sits to the right of A in C */
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
#endif
      auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
      auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
      auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
      auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
      auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
      auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
      /* cooPerm[0..Annz) will hold the C positions of A's entries, cooPerm[Annz..) those of B */
      auto p1 = Ccusp->cooPerm->begin();
      auto p2 = Ccusp->cooPerm->begin();
      thrust::advance(p2,Annz);
      /* merge the two (row,col,val,tag) streams in (row,col) order */
      PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
      /* undo the in-place shift of B's column indices applied above */
      thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
#endif
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
      /* split the C positions by the source tag stored in wPerm (1 -> A, 0 -> B) */
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      auto pred = thrust::identity<int>();
      PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
      PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
#endif
      /* compress the merged COO row indices back into C's CSR row offsets */
      stat = cusparseXcoo2csr(Ccusp->handle,
                              Ccoo->data().get(),
                              c->nz,
                              m,
                              Ccsr->row_offsets->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
                               Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
      if (Acusp->transgen && Bcusp->transgen) { /* if A and B have the transpose, generate C transpose too */
        /* C' = [A'; B'] stacks A' on top of B', so its CSR arrays are simple concatenations */
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix *CcsrT = new CsrMatrix;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;

        Ccusp->transgen = PETSC_TRUE;
        CmatT->cprowIndices  = NULL;
        CmatT->mat = CcsrT;
        CcsrT->num_rows = n;
        CcsrT->num_cols = m;
        CcsrT->num_entries = c->nz;

        CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values = new THRUSTARRAY(c->nz);

        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
          thrust::advance(rT,-1); /* overlap the shared offset between the two row-offset blocks */
        }
        if (BT) {
          /* B' row offsets are shifted by a->nz since A' contributes the first a->nz entries */
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
          thrust::copy(titb,tite,rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

        stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
                                 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
                                 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* mirror the pattern back to the host CSR arrays of C */
    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii   = *Ccsr->row_offsets;
      jj   = *Ccsr->column_indices;
      cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    } else {
      cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
    c->maxnz = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax = 0;
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i+1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax,nn);
    }
    ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
    (*C)->nonzerostate++;
    ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
    ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* MAT_REUSE_MATRIX: C has the pattern already; copy only the numerical values */
    if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",(*C)->rmap->n,B->rmap->n);
    c = (Mat_SeqAIJ*)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
      if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
      if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
      if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
      if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix*)Acusp->mat->mat;
      Bcsr = (CsrMatrix*)Bcusp->mat->mat;
      Ccsr = (CsrMatrix*)Ccusp->mat->mat;
      if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
      if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
      if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
      if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
      if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
      /* cooPerm[0..Annz) holds the C positions of A's entries, cooPerm[Annz..) those of B */
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid,Acsr->num_entries);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      thrust::for_each(zibait,zieait,VecCUDAEquals());
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
      thrust::for_each(zibbit,ziebit,VecCUDAEquals());
      if (Acusp->transgen && Bcusp->transgen && Ccusp->transgen) {
        /* refresh C' values by concatenating A' and B' values (C' = [A'; B']) */
        if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
      }
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    }
  }
  ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
4093 
/* Copies n values of the MATSEQAIJCUSPARSE matrix A into v, optionally gathering
   through the index array idx.

   The values are read from the GPU copy of A. v may point to host or device
   memory (detected with isCudaMem()). When idx is non-NULL and n > 0,
   v[k] = av[idx[k]] for k = 0,...,n-1 via a device-side gather; otherwise the
   first n values are copied verbatim.

   Fix: the transfer-logging call in the host-destination case was
   PetscLogCpuToGpu, but the data moves device -> host there, so it must be
   PetscLogGpuToCpu. */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  PetscErrorCode    ierr;
  bool              dmem;   /* true when v resides in device memory */
  const PetscScalar *av;
  cudaError_t       cerr;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
  if (n && idx) {
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); /* idx goes host -> device */

    THRUSTARRAY *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w = new THRUSTARRAY(n); /* device staging buffer for a host destination */
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* device-side gather: dv[k] = dav[widx[k]] */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (w) {
      cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    delete w;
  } else {
    cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  /* a host destination means the values traveled device -> host */
  if (!dmem) { ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr); }
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4132 }
4133