xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision a2fddd78f770fa4fc19a8af67e65be331f27d92b)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
7 
8 #include <petscconf.h>
9 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
10 #include <../src/mat/impls/sbaij/seq/sbaij.h>
11 #include <../src/vec/vec/impls/dvecimpl.h>
12 #include <petsc/private/vecimpl.h>
13 #undef VecType
14 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
15 #include <thrust/async/for_each.h>
16 
/* Option-name table for PetscOptionsEnum(): entry order must match the MatCUSPARSEStorageFormat enum values (CSR=0, ELL=1, HYB=2), followed by the enum type name, option prefix, and a NULL terminator */
const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
18 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
19   /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
20     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
21 
22   typedef enum {
23       CUSPARSE_MV_ALG_DEFAULT = 0,
24       CUSPARSE_COOMV_ALG      = 1,
25       CUSPARSE_CSRMV_ALG1     = 2,
26       CUSPARSE_CSRMV_ALG2     = 3
27   } cusparseSpMVAlg_t;
28 
29   typedef enum {
30       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
31       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
32       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
33       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
34       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
35       CUSPARSE_SPMM_ALG_DEFAULT = 0,
36       CUSPARSE_SPMM_COO_ALG1    = 1,
37       CUSPARSE_SPMM_COO_ALG2    = 2,
38       CUSPARSE_SPMM_COO_ALG3    = 3,
39       CUSPARSE_SPMM_COO_ALG4    = 5,
40       CUSPARSE_SPMM_CSR_ALG1    = 4,
41       CUSPARSE_SPMM_CSR_ALG2    = 6,
42   } cusparseSpMMAlg_t;
43 
44   typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
47   } cusparseCsr2CscAlg_t;
48   */
  /* Enum-name tables used by PetscOptionsEnum(); entry order must match the 0-based
     integer values of the cuSPARSE enums reproduced in the comment above, and the
     consistency checks in MatSetFromOptions_SeqAIJCUSPARSE() rely on that ordering. */
  const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
  const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
  const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
52 #endif
53 
54 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
55 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
56 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
57 
58 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
59 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
60 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
61 
62 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
63 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
64 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
65 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
66 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
67 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
68 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
69 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
70 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
71 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
72 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
73 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
74 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
75 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
76 
77 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
78 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
79 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
80 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
81 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
82 
83 PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
84 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
85 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);
86 
87 PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
88 PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
89 
90 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
91 
/*
  MatCUSPARSESetStream - records the given CUDA stream on the matrix and makes
  the matrix's cuSPARSE handle issue its work on that stream.

  Errors with PETSC_ERR_COR if the matrix has no GPU data structure attached.
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cstat;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusp->stream = stream;
  cstat = cusparseSetStream(cusp->handle,cusp->stream);CHKERRCUSPARSE(cstat);
  PetscFunctionReturn(0);
}
103 
/*
  MatCUSPARSESetHandle - attaches an externally created cuSPARSE handle to the
  matrix, destroying any different handle the matrix previously held, and sets
  the handle to device pointer mode.

  Errors with PETSC_ERR_COR if the matrix has no GPU data structure attached.
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cstat;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusp->handle != handle) {
    /* release the old handle before adopting the new one */
    if (cusp->handle) {cstat = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(cstat);}
    cusp->handle = handle;
  }
  cstat = cusparseSetPointerMode(cusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(cstat);
  PetscFunctionReturn(0);
}
120 
/*
  MatCUSPARSEClearHandle - drops the matrix's reference to its cuSPARSE handle
  WITHOUT destroying the handle (the caller is assumed to own it).  Silently a
  no-op when A is not a SEQAIJCUSPARSE matrix or carries no GPU data.
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          isseqaijcusparse;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&isseqaijcusparse);CHKERRQ(ierr);
  if (isseqaijcusparse && cusp && cusp->handle) cusp->handle = 0;
  PetscFunctionReturn(0);
}
133 
/* Query callback composed on factor matrices: reports MATSOLVERCUSPARSE ("cusparse")
   as the solver package that produced the factorization.  A is unused. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
140 
141 /*MC
142   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
143   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
146   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
147   algorithms are not recommended. This class does NOT support direct solver operations.
148 
149   Level: beginner
150 
151 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
152 M*/
153 
/*
  Factory routine registered for the "cusparse" solver package: creates the (square)
  factor matrix B for A and wires up the symbolic-factorization entry points and
  preferred orderings for the requested factor type.  The numeric routines are set
  later by the symbolic phase.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  /* NOTE(review): factortype is assigned before MatSetType — presumably the type
     constructor inspects it; confirm before reordering these two statements */
  (*B)->factortype = ftype;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    /* nested dissection for complete LU; natural ordering for the incomplete variants */
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);CHKERRQ(ierr);
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]);CHKERRQ(ierr);
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* factor matrices allocate their storage during the symbolic phase, not here */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  (*B)->canuseordering = PETSC_TRUE;
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
184 
/*
  Type-specific implementation of MatCUSPARSESetFormat() for SEQAIJCUSPARSE.
  A sequential matrix keeps a single storage format, so MAT_CUSPARSE_MULT and
  MAT_CUSPARSE_ALL have the same effect; any other operation is rejected.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (op == MAT_CUSPARSE_MULT || op == MAT_CUSPARSE_ALL) {
    cusp->format = format;
  } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  PetscFunctionReturn(0);
}
202 
203 /*@
204    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
205    operation. Only the MatMult operation can use different GPU storage formats
206    for MPIAIJCUSPARSE matrices.
207    Not Collective
208 
209    Input Parameters:
210 +  A - Matrix of type SEQAIJCUSPARSE
211 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
212 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
213 
214    Output Parameter:
215 
216    Level: intermediate
217 
218 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
219 @*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* PetscTryMethod only invokes the implementation if the matrix type composed
     "MatCUSPARSESetFormat_C"; other types silently ignore the call */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
229 
/*
  MatSetOption implementation for SEQAIJCUSPARSE: handles the GPU-specific
  MAT_FORM_EXPLICIT_TRANSPOSE option locally and delegates everything else to
  the CPU SeqAIJ implementation.
*/
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* destroy any cached transpose when the option is turned off, so a later
       re-enable cannot observe a stale transpose matrix */
    if (A->form_explicit_transpose && !flg) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);}
    A->form_explicit_transpose = flg;
  } else {
    ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
247 
248 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
249 
/*
  Numeric LU factorization: the factorization itself runs on the CPU (SeqAIJ
  kernel), after which the offload mask is marked CPU-resident, the MatSolve
  variant is selected (natural ordering admits a faster path), and the
  triangular factors are analyzed and copied to the GPU.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* make sure the host copy of A is current before the CPU factorization */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  /* no MatMatSolve support in either case */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
280 
/*
  Processes -mat_cusparse_* runtime options for an (unfactored) SEQAIJCUSPARSE
  matrix: storage format for SpMV and/or TriSolve and, with CUDA >= 11, the
  cuSPARSE SpMV/SpMM/csr2csc algorithm choices.  Factored matrices accept no
  options here.
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    /* -mat_cusparse_mult_storage_format affects only SpMV ... */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    /* ... while -mat_cusparse_storage_format sets the format for all operations */
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");

    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
   #endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
316 
/* Symbolic ILU: invalidate any GPU triangular-factor data held by B, then defer
   to the CPU SeqAIJ kernel; the numeric phase is redirected to this class. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&trifactors);CHKERRQ(ierr); /* stale GPU factors must not survive a new symbolic phase */
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
328 
/* Symbolic complete LU: invalidate any GPU triangular-factor data held by B, then
   defer to the CPU SeqAIJ kernel; the numeric phase is redirected to this class. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&trifactors);CHKERRQ(ierr); /* stale GPU factors must not survive a new symbolic phase */
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
340 
/* Symbolic ICC: invalidate any GPU triangular-factor data held by B, then defer
   to the CPU SeqAIJ kernel; the numeric phase is redirected to this class. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&trifactors);CHKERRQ(ierr); /* stale GPU factors must not survive a new symbolic phase */
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
352 
/* Symbolic Cholesky: invalidate any GPU triangular-factor data held by B, then
   defer to the CPU SeqAIJ kernel; the numeric phase is redirected to this class. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&trifactors);CHKERRQ(ierr); /* stale GPU factors must not survive a new symbolic phase */
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
364 
/*
  Builds (or refreshes) the lower-triangular ILU factor on the GPU from the host
  factor stored in A, inserting an explicit unit diagonal entry in every row
  (the cuSPARSE descriptor is nevertheless marked CUSPARSE_DIAG_TYPE_UNIT).
  On first call the full CSR structure is assembled in pinned host buffers,
  uploaded, and the cuSPARSE triangular-solve analysis is run; on subsequent
  calls only the numerical values are refreshed.  Runs only when the host copy
  is current (offload mask UNALLOCATED or CPU).
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* NOTE(review): assumes a->i here indexes the L part of the factored SeqAIJ storage — confirm against MatLUFactorNumeric_SeqAIJ */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) { /* first call: build structure and values from scratch */
        PetscScalar                       *AALo;

        /* cudaMallocHost gives page-locked (pinned) staging buffers for the uploads below */
        cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the lower triangular matrix */
        cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the lower triangular matrix */
        AiLo[0]  = (PetscInt) 0;
        AiLo[n]  = nzLower;
        AjLo[0]  = (PetscInt) 0;
        AALo[0]  = (MatScalar) 1.0;   /* row 0 consists of the unit diagonal only */
        v        = aa;
        vi       = aj;
        offset   = 1;
        rowOffset= 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          /* copy the strictly-lower entries of row i, then append the unit diagonal */
          ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;

        /* the assign() calls upload the pinned host buffers to device vectors */
        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* CUDA >= 9 triangular solves require an explicitly sized work buffer */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(),
                               #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 loTriFactor->solveInfo,
                                 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                               #else
                                 loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
                               #endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h = AALo;   /* the pinned value buffer is kept for later numeric-only updates */
        cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0]  = 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
          offset      += nz;
          loTriFactor->AA_h[offset] = 1.0;   /* unit diagonal, as in the structural pass */
          offset      += 1;
          v  += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
511 
512 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
513 {
514   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
515   PetscInt                          n = A->rmap->n;
516   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
517   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
518   cusparseStatus_t                  stat;
519   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
520   const MatScalar                   *aa = a->a,*v;
521   PetscInt                          *AiUp, *AjUp;
522   PetscInt                          i,nz, nzUpper, offset;
523   PetscErrorCode                    ierr;
524   cudaError_t                       cerr;
525 
526   PetscFunctionBegin;
527   if (!n) PetscFunctionReturn(0);
528   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
529     try {
530       /* next, figure out the number of nonzeros in the upper triangular matrix. */
531       nzUpper = adiag[0]-adiag[n];
532       if (!upTriFactor) {
533         PetscScalar *AAUp;
534 
535         cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
536 
537         /* Allocate Space for the upper triangular matrix */
538         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
539         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
540 
541         /* Fill the upper triangular matrix */
542         AiUp[0]=(PetscInt) 0;
543         AiUp[n]=nzUpper;
544         offset = nzUpper;
545         for (i=n-1; i>=0; i--) {
546           v  = aa + adiag[i+1] + 1;
547           vi = aj + adiag[i+1] + 1;
548 
549           /* number of elements NOT on the diagonal */
550           nz = adiag[i] - adiag[i+1]-1;
551 
552           /* decrement the offset */
553           offset -= (nz+1);
554 
555           /* first, set the diagonal elements */
556           AjUp[offset] = (PetscInt) i;
557           AAUp[offset] = (MatScalar)1./v[nz];
558           AiUp[i]      = AiUp[i+1] - (nz+1);
559 
560           ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
561           ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
562         }
563 
564         /* allocate space for the triangular factor information */
565         ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
566         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
567 
568         /* Create the matrix description */
569         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
570         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
571        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
572         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
573        #else
574         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
575        #endif
576         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
577         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
578 
579         /* set the operation */
580         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
581 
582         /* set the matrix */
583         upTriFactor->csrMat = new CsrMatrix;
584         upTriFactor->csrMat->num_rows = n;
585         upTriFactor->csrMat->num_cols = n;
586         upTriFactor->csrMat->num_entries = nzUpper;
587 
588         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
589         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
590 
591         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
592         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
593 
594         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
595         upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
596 
597         /* Create the solve analysis information */
598         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
599         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
600       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
601         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
602                                      upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
603                                      upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
604                                      upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
605                                      &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
606         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
607       #endif
608 
609         /* perform the solve analysis */
610         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
611                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
612                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
613                                  upTriFactor->csrMat->column_indices->data().get(),
614                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
615                                  upTriFactor->solveInfo,
616                                  upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
617                                #else
618                                  upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
619                                #endif
620         cerr = WaitForCUDA();CHKERRCUDA(cerr);
621         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
622 
623         /* assign the pointer */
624         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
625         upTriFactor->AA_h = AAUp;
626         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
627         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
628         ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
629       } else {
630         if (!upTriFactor->AA_h) {
631           cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
632         }
633         /* Fill the upper triangular matrix */
634         offset = nzUpper;
635         for (i=n-1; i>=0; i--) {
636           v  = aa + adiag[i+1] + 1;
637 
638           /* number of elements NOT on the diagonal */
639           nz = adiag[i] - adiag[i+1]-1;
640 
641           /* decrement the offset */
642           offset -= (nz+1);
643 
644           /* first, set the diagonal elements */
645           upTriFactor->AA_h[offset] = 1./v[nz];
646           ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
647         }
648         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
649         ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
650       }
651     } catch(char *ex) {
652       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
653     }
654   }
655   PetscFunctionReturn(0);
656 }
657 
/*
   Pushes the ILU factors of A to the GPU and caches the row/column
   permutations there.  The lower/upper triangular device matrices are
   (re)built, a work vector of length n is lazily created, and the row
   (isrow) and column (icol) permutation indices are copied to the device
   once, only when the corresponding ordering is not the identity.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           rowis,colis;
  PetscBool                    rowid,colid;
  PetscInt                     m;

  PetscFunctionBegin;
  if (!fs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  rowis = aij->row;
  colis = aij->icol;
  m     = A->rmap->n;

  /* build both triangular factors on the device */
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  if (!fs->workVector) fs->workVector = new THRUSTARRAY(m);
  fs->nnz = aij->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* row permutation indices (used by the lower triangular solve) */
  ierr = ISIdentity(rowis,&rowid);CHKERRQ(ierr);
  if (!rowid && !fs->rpermIndices) {
    const PetscInt *r;

    ierr = ISGetIndices(rowis,&r);CHKERRQ(ierr);
    fs->rpermIndices = new THRUSTINTARRAY(m);
    fs->rpermIndices->assign(r, r+m);
    ierr = ISRestoreIndices(rowis,&r);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(m*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* column permutation indices (used by the upper triangular solve) */
  ierr = ISIdentity(colis,&colid);CHKERRQ(ierr);
  if (!colid && !fs->cpermIndices) {
    const PetscInt *c;

    ierr = ISGetIndices(colis,&c);CHKERRQ(ierr);
    fs->cpermIndices = new THRUSTINTARRAY(m);
    fs->cpermIndices->assign(c, c+m);
    ierr = ISRestoreIndices(colis,&c);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(m*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
701 
/*
   Builds (or refreshes) the device-side triangular factors for an ICC
   factorization.  The factored matrix A holds its data in SeqSBAIJ layout
   (accessed through the Mat_SeqSBAIJ cast below), storing only the upper
   triangle row-by-row.  Two CSR matrices are produced on the GPU:
     - upTriFactor: the upper factor, with unit diagonal declared to
       cuSPARSE but 1/d stored explicitly in the values array;
     - loTriFactor: the SAME sparsity pattern, solved with
       CUSPARSE_OPERATION_TRANSPOSE so it acts as the lower factor.
   On the first call the full structure (row offsets, column indices,
   descriptors, solve analysis) is created; on later calls only the
   numerical values are refilled and re-uploaded.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* view the same data through the SBAIJ struct: the ICC factor is stored
     in upper-triangular SBAIJ format */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers for the two value arrays */
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements: both factors store 1/d(i);
             v[nz] holds the row's diagonal entry (last in the row) */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            /* off-diagonals: upper factor gets -v, lower factor -v/d(i) */
            ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* csrsv2 (CUDA >= 9) only accepts GENERAL matrices */
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        /* UNIT diag: the stored diagonal (1/d) is applied by the values, not by cuSPARSE */
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* csrsv2 needs a user-allocated work buffer for the analysis/solve */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(),
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 upTriFactor->solveInfo,
                                 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                                #else
                                  upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
                                #endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        /* fill mode stays UPPER: the lower solve is done as a TRANSPOSE
           solve on the (upper-stored) pattern, see solveOp below */
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix: same structure (AiUp/AjUp) as the upper factor,
           only the values (AALo) differ */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(),
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 loTriFactor->solveInfo,
                                 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                                #else
                                 loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
                                #endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      } else {
        /* structures already exist on the GPU: refill only the numerical
           values (same recurrence as the first-time path above) */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
921 
/*
   Pushes the ICC triangular factors of A to the GPU and caches the
   permutation indices on the device.

   Fix: the device permutation arrays are now allocated only once.  The
   previous code did `new THRUSTINTARRAY(n)` on every numeric
   factorization when the ordering was not the identity, leaking the old
   arrays on repeated factorizations of the same matrix.  This mirrors
   the guards already used in MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  /* off-diagonal entries appear in both factors, the diagonal once */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip,*rip;

    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
    /* The ordering is fixed at symbolic factorization, so allocate the
       device arrays only on the first numeric factorization (avoids a
       leak); the assign() still refreshes the contents each time. */
    if (!cusparseTriFactors->rpermIndices) {
      cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    }
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    if (!cusparseTriFactors->cpermIndices) {
      cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    }
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
959 
/*
   Numeric Cholesky factorization for SeqAIJCUSPARSE: performs the
   factorization on the host (after ensuring host data is current) and
   then mirrors the triangular factors onto the GPU.  The solve kernels
   installed on B depend on whether the ordering is natural.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *baij = (Mat_SeqAIJ*)B->data;
  PetscBool      natural;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* make sure the host copy of A is current, then factor on the CPU */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* determine which version of MatSolve needs to be used: the natural
     ordering allows the permutation-free kernels */
  ierr = ISIdentity(baij->row,&natural);CHKERRQ(ierr);
  B->ops->solve             = natural ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering : MatSolve_SeqAIJCUSPARSE;
  B->ops->solvetranspose    = natural ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering : MatSolveTranspose_SeqAIJCUSPARSE;
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors onto the GPU */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
989 
/*
   Builds the transposes (CSC form, stored as CSR) of the lower and upper
   triangular factors already present in A->spptr and runs the cuSPARSE
   triangular-solve analysis on each, so that MatSolveTranspose can use
   NON_TRANSPOSE solves on the transposed data.  Results are stored in
   loTriFactorPtrTranspose / upTriFactorPtrTranspose.

   Fix: the MAT_CUSPARSEGenerateTranspose event was begun twice and never
   ended around both csr2csc conversions (PetscLogEventBegin instead of
   PetscLogEventEnd after WaitForCUDA), leaving the event unbalanced and
   corrupting the profiling log; both second calls are now EventEnd.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor:
     transposition flips the fill mode; type/base/diag are inherited */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* CUDA 11 csr2csc needs an explicit work buffer; it is owned by the
     untransposed factor so it can be reused when values are updated */
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                       loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                       loTriFactor->csrMat->values->data().get(),
                                       loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(),
                                       loTriFactorT->csrMat->values->data().get(),
                                       loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat);
                        #else
                          loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
                        #endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                           loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                           loTriFactorT->csrMat->column_indices->data().get(),
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           loTriFactorT->solveInfo,
                           loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                          #else
                           loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
                          #endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                upTriFactor->csrMat->values->data().get(),
                                upTriFactor->csrMat->row_offsets->data().get(),
                                upTriFactor->csrMat->column_indices->data().get(),
                                upTriFactorT->csrMat->values->data().get(),
                                upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                CUSPARSE_ACTION_NUMERIC,indexBase,
                                CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat);
                        #else
                          upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
                        #endif

  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
  #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
  #endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                           upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                           upTriFactorT->csrMat->column_indices->data().get(),
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           upTriFactorT->solveInfo,
                           upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                          #else
                           upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
                          #endif

  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
1195 
/* Unary functor mapping a PetscScalar to a PetscInt by truncating its real part.
   Used below with thrust::transform to turn permutation values that were carried
   through csr2csc as scalars back into integer indices. */
struct PetscScalarToPetscInt
{
  __host__ __device__ PetscInt operator()(PetscScalar s)
  {
    return static_cast<PetscInt>(PetscRealPart(s));
  }
};
1204 
/*
   Build (or refresh) the explicit transpose of a SeqAIJCUSPARSE matrix on the GPU and cache it
   in cusparsestruct->matTranspose, so transpose products can be done as an ordinary SpMV on A^T.

   Fast path: returns immediately when A does not request an explicit transpose, when A has no
   rows or no columns, or when the cached transpose is already current (A->transupdated).

   For the CSR format the routine also caches an integer permutation (csr2csc_i) mapping the CSR
   value ordering of A to the value ordering of A^T; subsequent numeric updates of the transpose
   then reduce to a single permuted device copy instead of a full csr2csc conversion.
*/
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (!A->form_explicit_transpose || !A->rmap->n || !A->cmap->n) PetscFunctionReturn(0);
  /* make sure the device copy of A is current before deriving its transpose */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(0);
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
    /* ELL/HYB keep no incrementally-updatable transpose; drop any stale one and rebuild below */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
  }
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
    /* the transpose inherits the index base (0- or 1-based) of the original matrix */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

    /* set alpha and beta */
    /* device-resident 1.0/0.0 scalars used later as SpMV coefficients */
    err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* allocate the CSR arrays of A^T: cmap->n rows, rmap->n cols, same nnz as A */
      CsrMatrix *matrixT = new CsrMatrix;
      matstructT->mat = matrixT;
      matrixT->num_rows = A->cmap->n;
      matrixT->num_cols = A->rmap->n;
      matrixT->num_entries = a->nz;
      matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values = new THRUSTARRAY(a->nz);

      /* device copy of A's row offsets, needed as csr2csc input */
      if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* CUDA >= 11 uses the generic SpMV API, which needs a cusparseSpMatDescr_t */
      stat = cusparseCreateCsr(&matstructT->matDescr,
                               matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                               matrixT->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
     #endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
   #else
      /* legacy path: HYB -> CSR -> CSC (= CSR of A^T) -> HYB, using two temporary CSR matrices */
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows = A->rmap->n;
      temp->num_cols = A->cmap->n;
      temp->num_entries = a->nz;
      temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle,
                              matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get());CHKERRCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      /* NOTE(review): for a transpose one would expect num_rows = A->cmap->n and the
         row_offsets array sized A->cmap->n+1; as written this is only consistent when
         the matrix is square (rmap->n == cmap->n) -- confirm before relying on the
         HYB/ELL path for rectangular matrices */
      tempT->num_rows = A->rmap->n;
      tempT->num_cols = A->cmap->n;
      tempT->num_entries = a->nz;
      tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparseTriFactors_handle_placeholder, temp->num_rows,
                              temp->num_cols, temp->num_entries,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get(),
                              tempT->values->data().get(),
                              tempT->column_indices->data().get(),
                              tempT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

      /* Last, convert CSC to HYB */
      cusparseHybMat_t hybMat;
      stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
      cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
        CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                              matstructT->descr, tempT->values->data().get(),
                              tempT->row_offsets->data().get(),
                              tempT->column_indices->data().get(),
                              hybMat, 0, partition);CHKERRCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY*) tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
        delete (CsrMatrix*) tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY*) temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
        delete (CsrMatrix*) temp;
      }
     #endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
    if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix");
    if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows");
    if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols");
    if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values");
    if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT");
    if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows");
    if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols");
    if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
      ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    if (!cusparsestruct->csr2csc_i) {
      /* Build the CSR->CSC value permutation once: run csr2csc on the sequence 0,1,2,...
         stored as scalars; the permuted result, truncated back to integers, is the
         permutation that maps A's value array onto A^T's value array. */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* CUDA >= 11 csr2csc needs an explicitly sized external work buffer */
      void   *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                           A->cmap->n, matrix->num_entries,
                                           matrix->values->data().get(),
                                           cusparsestruct->rowoffsets_gpu->data().get(),
                                           matrix->column_indices->data().get(),
                                           matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                           CUSPARSE_ACTION_NUMERIC,indexBase,
                                           cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat);
      err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err);
     #endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                              A->cmap->n,matrix->num_entries,
                              csr2csc_a.data().get(),
                              cusparsestruct->rowoffsets_gpu->data().get(),
                              matrix->column_indices->data().get(),
                              matrixT->values->data().get(),
                             #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                              matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                              CUSPARSE_ACTION_NUMERIC,indexBase,
                              cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat);
                             #else
                              matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
                             #endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
      }

      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      err = cudaFree(csr2cscBuffer);CHKERRCUDA(err);
     #endif
    }
    /* numeric update of the transpose: gather A's values through the cached permutation */
    PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
                                                     thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
                                                     matrixT->values->begin()));
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
1413 
1414 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
   Solve A^T x = b using the cached transposed triangular factors (built lazily by
   MatSeqAIJCUSPARSEAnalyzeTransposeForSolve). Pipeline:
     1. apply the row permutation to b (into x),
     2. triangular solve with the transposed upper factor (x -> work vector),
     3. triangular solve with the transposed lower factor (work vector -> x),
     4. apply the column permutation (x -> work vector, then copied back to x,
        since the permuted gather cannot be done in place).
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: x[i] = b[rperm[i]] */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U: input xarray, output tempGPU */
  /* (CUDA >= 9 variants of cusparse_solve additionally take nnz, a solve policy and an external buffer) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Then, solve L: input tempGPU, output xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                         xarray);CHKERRCUSPARSE(stat);
                      #endif

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* flop count for the two triangular solves */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1502 
/*
   Solve A^T x = b when the factorization uses natural ordering, i.e. no row/column
   permutations are needed: solve with the transposed upper factor (b -> work vector),
   then with the transposed lower factor (work vector -> x). Transposed factors are
   built lazily on first use.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U: input barray, output tempGPU */
  /* (CUDA >= 9 variants of cusparse_solve additionally take nnz, a solve policy and an external buffer) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Then, solve L: input tempGPU, output xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        xarray);CHKERRCUSPARSE(stat);
                      #endif

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* flop count for the two triangular solves */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1572 
/*
   Solve A x = b with the cached lower/upper triangular factors and row/column
   permutations from the (I)LU factorization:
     1. apply the row permutation to b (into the work vector),
     2. forward solve with L (work vector -> x),
     3. backward solve with U (x -> work vector),
     4. apply the column permutation (work vector -> x).
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: tempGPU[i] = b[rperm[i]] */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L: input tempGPU, output xarray */
  /* (CUDA >= 9 variants of cusparse_solve additionally take nnz, a solve policy and an external buffer) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                         xarray,
                         loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                         xarray);CHKERRCUSPARSE(stat);
                      #endif

  /* Then, solve U: input xarray, output tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,xarray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Last, reorder with the column permutation: x[i] = tempGPU[cperm[i]] */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* flop count for the two triangular solves */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1648 
/*
   Solve A x = b when the factorization uses natural ordering (no permutations):
   forward solve with L (b -> work vector), then backward solve with U
   (work vector -> x).
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L: input barray, output tempGPU */
  /* (CUDA >= 9 variants of cusparse_solve additionally take nnz, a solve policy and an external buffer) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        loTriFactor->solvePolicy,loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Next, solve U: input tempGPU, output xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        xarray);CHKERRCUSPARSE(stat);
                      #endif

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* flop count for the two triangular solves */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1710 
/* Mirror the device-side CSR values back into the host Mat_SeqAIJ value array
   when the freshest data lives only on the GPU; on return both copies agree
   and A->offloadmask is PETSC_OFFLOAD_BOTH.  A no-op in every other offload
   state. */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *aij  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cudaError_t        cerr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (A->offloadmask != PETSC_OFFLOAD_GPU) PetscFunctionReturn(0); /* host copy already current */
  {
    CsrMatrix *csr = (CsrMatrix*)cusp->mat->mat;

    ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    /* only the nonzero values move; the sparsity pattern is unchanged on both sides */
    cerr = cudaMemcpy(aij->a,csr->values->data().get(),aij->nz*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuToCpu(aij->nz*sizeof(PetscScalar));CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
1731 
/* MatSeqAIJGetArray() implementation for MATSEQAIJCUSPARSE: hand out the host
   value array, first syncing it from the GPU.  The offload mask is downgraded
   to PETSC_OFFLOAD_CPU because the caller may modify the returned values. */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* make sure the host copy is up to date before exposing it */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array = aij->a;
  PetscFunctionReturn(0);
}
1743 
/*
   Bring the cuSPARSE (device) representation of the AIJ matrix up to date
   with the host data.

   Fast path: when the nonzero pattern is unchanged (A->nonzerostate matches
   the cached one) and the format is CSR, only the numerical values are
   re-uploaded and the cached transpose has its values (not its structure)
   invalidated.

   Slow path: the device structures are destroyed and rebuilt from the host
   CSR arrays (optionally compressed-row) in CSR or, pre-CUDA-11, ELL/HYB
   format.

   On success A->offloadmask becomes PETSC_OFFLOAD_BOTH, except when the host
   has no value array (a->a == NULL): then only the pattern was uploaded and
   the mask is left unchanged.
*/
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE; /* FALSE when host values are missing; see below */
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      matrix->values->assign(a->a, a->a+a->nz);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* values changed but pattern did not: keep the transpose structure (PETSC_FALSE) */
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
    } else {
      PetscInt nnz;
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* pattern (or format) changed: discard every cached device structure */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        if (a->compressedrow.use) { /* only rows with nonzeros are represented */
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
        if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");
        /* no host values: upload the pattern only, and do not mark the host copy valid */
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
        else nnz = a->nz;

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident scalar constants, so cuSPARSE can run in device pointer mode */
        err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                    mat->num_rows, mat->num_cols, mat->num_entries,
                                    mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                    mat->values->data().get(),
                                    CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                    CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
          }
         #endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
         #else
          /* build a temporary device CSR copy, convert it to HYB, then free the CSR copy */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
              matstruct->descr, mat->values->data().get(),
              mat->row_offsets->data().get(),
              mat->column_indices->data().get(),
              hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          /* the temporary CSR arrays are no longer needed once converted */
          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
         #endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        /* account for row offsets + column indices (int), row indices, and the 3 scalar constants + values */
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
1900 
/* Thrust functor for zipped iterators: accumulate the first tuple slot into
   the second, i.e. dest += src. */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
1910 
/* Thrust functor for zipped iterators: copy the first tuple slot into the
   second, i.e. dest = src. */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
1920 
/* Thrust functor for zipped iterators: copy in the opposite direction of
   VecCUDAEquals, i.e. the second tuple slot into the first. */
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};
1930 
/* Per-product state stashed in C->product->data for products with a
   MATSEQAIJCUSPARSE operand; released by MatDestroy_MatMatCusparse(). */
struct MatMatCusparse {
  PetscBool             cisdense; /* caller's C was MATSEQDENSE (CPU): convert the result back after computing on the GPU */
  PetscScalar           *Bt;      /* device buffer for an explicit B^T (pre-CUDA-11 csrmm cannot transpose B) */
  Mat                   X;        /* intermediate dense product used by MATPRODUCT_RARt/PtAP */
  PetscBool             reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops;    /* flop count logged during the numeric phase */
  CsrMatrix             *Bcsr;    /* alternate CSR view of B when B is kept in compressed row storage */

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr; /* descriptor for Bcsr, used in place of Bmat->matDescr */
  PetscBool             initialized;   /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr;
  cusparseDnMatDescr_t  matCDescr;
  PetscInt              Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  void                  *dBuffer4; /* extra device work buffers owned by this struct; freed in MatDestroy_MatMatCusparse (allocation not in this file section) */
  void                  *dBuffer5;
 #endif
  size_t                mmBufferSize;
  void                  *mmBuffer;
  void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};
1955 
/* Destructor installed as C->product->destroy: releases every CUDA/cuSPARSE
   resource held by a MatMatCusparse container, then the container itself. */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  PetscErrorCode   ierr;
  MatMatCusparse   *mm = (MatMatCusparse*)data;
  cudaError_t      cuerr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t cstat;
 #endif

  PetscFunctionBegin;
  cuerr = cudaFree(mm->Bt);CHKERRCUDA(cuerr); /* cudaFree(NULL) is a harmless no-op */
  delete mm->Bcsr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* descriptors and SpGEMM state exist only on the CUDA >= 11 code path */
  if (mm->matSpBDescr) { cstat = cusparseDestroySpMat(mm->matSpBDescr);CHKERRCUSPARSE(cstat); }
  if (mm->matBDescr)   { cstat = cusparseDestroyDnMat(mm->matBDescr);CHKERRCUSPARSE(cstat); }
  if (mm->matCDescr)   { cstat = cusparseDestroyDnMat(mm->matCDescr);CHKERRCUSPARSE(cstat); }
  if (mm->spgemmDesc)  { cstat = cusparseSpGEMM_destroyDescr(mm->spgemmDesc);CHKERRCUSPARSE(cstat); }
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  if (mm->dBuffer4)  { cuerr = cudaFree(mm->dBuffer4);CHKERRCUDA(cuerr); }
  if (mm->dBuffer5)  { cuerr = cudaFree(mm->dBuffer5);CHKERRCUDA(cuerr); }
 #endif
  if (mm->mmBuffer)  { cuerr = cudaFree(mm->mmBuffer);CHKERRCUDA(cuerr); }
  if (mm->mmBuffer2) { cuerr = cudaFree(mm->mmBuffer2);CHKERRCUDA(cuerr); }
 #endif
  ierr = MatDestroy(&mm->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1984 
1985 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1986 
/*
   Numeric phase for products of a MATSEQAIJCUSPARSE matrix A with a dense
   matrix B (AB, AtB, ABt, PtAP, RARt).  The sparse-dense multiply runs via
   cusparseSpMM on CUDA >= 11 (caching dense/sparse descriptors and the SpMM
   work buffer in mmdata) or via cusparseXcsrmm on older CUDA (explicitly
   transposing B into mmdata->Bt when needed).  For PtAP/RARt the partial
   product lands in mmdata->X and the final dense-dense multiply is done by
   MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private().
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr   = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp   = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* pick the sparse operand, its operation, and the (m x n) result dimensions */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    /* either transpose inside the cuSPARSE call or use the cached explicit transpose */
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);

  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* PtAP/RARt write the intermediate product into mmdata->X instead of C */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
    /* grow the work buffer only when the required size increased */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      cudaError_t cerr;
      cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
      cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
 #else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    /* form B^T explicitly into mmdata->Bt (allocated in the symbolic phase) */
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k    = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
 #endif
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  /* finish PtAP/RARt with the dense-dense multiply against the intermediate X */
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  /* restore the original (CPU) types if the caller supplied CPU matrices */
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2159 
/*
   Symbolic phase for products of a MATSEQAIJCUSPARSE matrix with a dense
   matrix: determines the result dimensions per product type, converts C to
   MATSEQDENSECUDA (remembering whether it arrived as CPU dense so the
   numeric phase can convert it back), allocates the MatMatCusparse product
   data (the B^T buffer on pre-CUDA-11, the intermediate X for RARt/PtAP),
   and installs MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  /* result dimensions (m x n) for each supported product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense;
 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
 #endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2233 
/*
   Numeric phase of the sparse-sparse product C = op(A)*op(B) with all
   operands MATSEQAIJCUSPARSE.  On CUDA >= 11 this recomputes the values with
   cusparseSpGEMMreuse_compute (11.4+) or cusparseSpGEMM_compute/copy using
   the SpGEMM descriptor and buffers set up during the symbolic phase; on
   older CUDA it calls cusparse_csr_spgemm directly into the preallocated C
   arrays.  Transposed operand types are served via the cached explicit
   transposes (cuSPARSE SpGEMM itself does not transpose).  When
   mmdata->reusesym is set, the values were already computed in the symbolic
   phase and only the assembly bookkeeping at `finalize` runs.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t         BmatSpDescr;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;
  A = product->A;
  B = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix*)Cmat->mat;
    if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
    goto finalize;
  }
  if (!c->nz) goto finalize; /* empty product: nothing to compute, just finish assembly */
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);

  ptype = product->type;
  /* symmetric operands let us drop the transpose and use the plain AB kernels */
  if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
  if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix*)Cmat->mat;
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
    /* reuse path: structure was analyzed in the symbolic phase, recompute values only */
    stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                               Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                               cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                               mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
  #else
    stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                               Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                               cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                               mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
    stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                               Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                               cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
  #endif
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
  ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
  c->reallocs         = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}
2357 
/*
  MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE - symbolic phase of the sparse
  matrix-matrix product C = op(A)*op(B) for A, B of type MATSEQAIJCUSPARSE.

  Supported product types are AB, AtB and ABt; transposed operands are handled by
  first forming an explicit transpose (cuSPARSE spgemm does not accept transpose
  operations, see opA/opB below).  The routine
    - allocates the MatMatCusparse product data (destroyed via MatDestroy_MatMatCusparse),
    - computes the CSR structure of C on the device, using cusparseSpGEMMreuse
      (CUDA >= 11.4), cusparseSpGEMM (CUDA >= 11.0) or csrgemm (CUDA < 11),
    - copies C's row offsets and column indices back to the host so that C is a
      fully preallocated (but not yet assembled) SeqAIJ matrix,
  and installs MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE as the numeric phase.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *a,*b,*c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     i,j,m,n,k;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble               flops;
  PetscBool                    biscompressed,ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
  cusparseSpMatDescr_t         BmatSpDescr;
#else
  int                          cnz;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  /* both operands must be MATSEQAIJCUSPARSE in CSR storage format */
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ*)A->data;
  b = (Mat_SeqAIJ*)B->data;
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
  ptype = product->type;
  /* for symmetric operands a transposed product is equivalent to the plain AB product */
  if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
  if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  /* pick operand mult structs and result sizes (m x n, inner dimension k);
     AtB/ABt use the explicitly formed transpose of A/B respectively */
  switch (ptype) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    k = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr);
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  ierr  = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  ierr  = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  c     = (Mat_SeqAIJ*)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
  Ccsr  = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
    c->compressedrow.nrows = a->compressedrow.nrows;
    ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
    ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  /* the device CSR of C has only the nonzero rows when C is compressed */
  Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
  Ccusp->mat      = Cmat;
  Ccusp->mat->mat = Ccsr;
  Ccsr->num_rows    = Ccusp->nrows;
  Ccsr->num_cols    = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
  stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
  /* device-resident scalar constants used with CUSPARSE_POINTER_MODE_DEVICE */
  cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
    thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
    c->nz = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix*)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix */
    /* build an uncompressed view of B: reuse B's column indices/values, but with
       full-length row offsets (lazily uploaded to Bcusp->rowoffsets_gpu) */
    CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
    Bcsr = new CsrMatrix;
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices;
    Bcsr->values         = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
      ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr = Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
                               Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                               Bcsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  /* precompute flops count */
  if (ptype == MATPRODUCT_AB) {
    /* 2 flops per (A entry, matching B row entry) pair */
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i+1];
      for (j=st; j<en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2.*(b->i[brow+1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt anzi = a->i[i+1] - a->i[i];
      const PetscInt bnzi = b->i[i+1] - b->i[i];
      flops += (2.*anzi)*bnzi;
    }
  } else { /* TODO */
    flops = 0.;
  }

  mmdata->flops = flops;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  /* C's descriptor starts with nnz = 0 and NULL arrays; pointers are set after
     the nnz of the product is known */
  stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
                          NULL, NULL, NULL,
                          CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                          CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
 {
  /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
     We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
  */
  void*  dBuffer1 = NULL;
  void*  dBuffer2 = NULL;
  void*  dBuffer3 = NULL;
  /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
  size_t bufferSize1 = 0;
  size_t bufferSize2 = 0;
  size_t bufferSize3 = 0;
  size_t bufferSize4 = 0;
  size_t bufferSize5 = 0;

  /*----------------------------------------------------------------------*/
  /* ask bufferSize1 bytes for external memory */
  stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                            CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                            &bufferSize1, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &dBuffer1, bufferSize1);CHKERRCUDA(cerr);
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                            CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                            &bufferSize1, dBuffer1);CHKERRCUSPARSE(stat);

  /*----------------------------------------------------------------------*/
  /* each reuse step follows the cuSPARSE two-call convention: query buffer size with NULL, allocate, then call again */
  stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                 &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &dBuffer2, bufferSize2);CHKERRCUDA(cerr);
  cerr = cudaMalloc((void**) &dBuffer3, bufferSize3);CHKERRCUDA(cerr);
  cerr = cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4);CHKERRCUDA(cerr);
  stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                 &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);CHKERRCUSPARSE(stat);
  cerr = cudaFree(dBuffer1);CHKERRCUDA(cerr);
  cerr = cudaFree(dBuffer2);CHKERRCUDA(cerr);

  /*----------------------------------------------------------------------*/
  /* get matrix C non-zero entries C_nnz1 */
  stat  = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
  c->nz = (PetscInt) C_nnz1;
  /* allocate matrix C */
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values         = new THRUSTARRAY(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  /* update matC with the new pointers */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());CHKERRCUSPARSE(stat);

  /*----------------------------------------------------------------------*/
  stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                  &bufferSize5, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5);CHKERRCUDA(cerr);
  stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                  &bufferSize5, mmdata->dBuffer5);CHKERRCUSPARSE(stat);
  cerr = cudaFree(dBuffer3);CHKERRCUDA(cerr);
  /* numeric values of this first product are computed here; later products with the
     same sparsity reuse dBuffer4/dBuffer5 in the numeric phase */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                                     Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                     cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                     mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
  ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024);CHKERRQ(ierr);
 }
 #else // ~PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  size_t bufSize2;
  /* ask bufferSize bytes for external memory */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
  /* ask bufferSize again bytes for external memory */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
  /* The CUSPARSE documentation is not clear, nor the API
     We need both buffers to perform the operations properly!
     mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
     it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
     is stored in the descriptor! What a messy API... */
  cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
  /* compute the intermediate product of A * B */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  /* get matrix C non-zero entries C_nnz1 */
  stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
  c->nz = (PetscInt) C_nnz1;
  ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr);
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
 #endif
#else
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
  stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
  c->nz = cnz;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
     I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
     D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
/* set up the host (SeqAIJ) side of C: i/j arrays, row lengths, diagonal markers */
finalizesym:
  c->singlemalloc = PETSC_FALSE;
  c->free_a       = PETSC_TRUE;
  c->free_ij      = PETSC_TRUE;
  ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
  if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
    PetscInt *d_i = c->i;
    THRUSTINTARRAY ii(Ccsr->row_offsets->size());
    THRUSTINTARRAY jj(Ccsr->column_indices->size());
    ii   = *Ccsr->row_offsets;
    jj   = *Ccsr->column_indices;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  } else {
    PetscInt *d_i = c->i;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  if (ciscompressed) { /* need to expand host row offsets */
    PetscInt r = 0;
    c->i[0] = 0;
    for (k = 0; k < c->compressedrow.nrows; k++) {
      const PetscInt next = c->compressedrow.rindex[k];
      const PetscInt old = c->compressedrow.i[k];
      for (; r < next; r++) c->i[r+1] = old;
    }
    for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
  }
  ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
  c->maxnz = c->nz;
  c->nonzerorowcnt = 0;
  c->rmax = 0;
  /* per-row lengths and row-length statistics, as MatAssemblyEnd_SeqAIJ would compute */
  for (k = 0; k < m; k++) {
    const PetscInt nn = c->i[k+1] - c->i[k];
    c->ilen[k] = c->imax[k] = nn;
    c->nonzerorowcnt += (PetscInt)!!nn;
    c->rmax = PetscMax(c->rmax,nn);
  }
  ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
  Ccsr->num_entries = c->nz;

  C->nonzerostate++;
  ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
  Ccusp->nonzerostate = C->nonzerostate;
  C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
  C->preallocated  = PETSC_TRUE;
  C->assembled     = PETSC_FALSE;
  C->was_assembled = PETSC_FALSE;
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
2750 
2751 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2752 
/* MatProductSetFromOptions_SeqAIJCUSPARSE - choose the symbolic implementation for a
   product whose A operand is SeqAIJCUSPARSE; B may be sparse (SeqAIJCUSPARSE) or dense.
   Operands bound to the CPU fall back to the host SeqAIJ paths. */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
  PetscErrorCode ierr;
  PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr);
  /* B counts as cusparse only when neither A nor B is pinned to the CPU */
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr);
  }
  /* the C operand only participates in triple products */
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr);
    }
  }
  if (isdense) {
    const MatProductType t = product->type;
    if (t == MATPRODUCT_AB || t == MATPRODUCT_AtB || t == MATPRODUCT_ABt || t == MATPRODUCT_PtAP || t == MATPRODUCT_RARt) {
      if (product->A->boundtocpu) {
        ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
    } else if (t == MATPRODUCT_ABC) {
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
    }
    /* any other product type: leave the operation unset */
  } else if (Biscusp && Ciscusp) {
    const MatProductType t = product->type;
    if (t == MATPRODUCT_AB || t == MATPRODUCT_AtB || t == MATPRODUCT_ABt) {
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
    } else if (t == MATPRODUCT_PtAP || t == MATPRODUCT_RARt || t == MATPRODUCT_ABC) {
      /* no specialized triple-product kernels: compose pairwise products */
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
    }
    /* any other product type: leave the operation unset */
  } else { /* fallback for AIJ */
    ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2811 
/* MatMult_SeqAIJCUSPARSE - yy = A*xx.
   Delegates to the shared kernel with no add-vector, trans=PETSC_FALSE, herm=PETSC_FALSE. */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);
  CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2820 
/* MatMultAdd_SeqAIJCUSPARSE - zz = A*xx + yy.
   Delegates to the shared kernel with trans=PETSC_FALSE, herm=PETSC_FALSE. */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);
  CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2829 
/* MatMultHermitianTranspose_SeqAIJCUSPARSE - yy = A^H * xx.
   Delegates to the shared kernel with no add-vector, trans=PETSC_TRUE, herm=PETSC_TRUE. */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);
  CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2838 
/* MatMultHermitianTransposeAdd_SeqAIJCUSPARSE - zz = A^H * xx + yy.
   Delegates to the shared kernel with trans=PETSC_TRUE, herm=PETSC_TRUE. */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);
  CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2847 
/* MatMultTranspose_SeqAIJCUSPARSE - yy = A^T * xx.
   Delegates to the shared kernel with no add-vector, trans=PETSC_TRUE, herm=PETSC_FALSE. */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);
  CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2856 
/* ScatterAdd - device kernel computing y[idx[i]] += x[i] for i in [0,n).

   Uses a grid-stride loop so that any launch configuration covers all n entries,
   and computes the loop index in PetscInt (the original used a 32-bit int, which
   can overflow when n exceeds INT_MAX with 64-bit PetscInt builds).

   NOTE(review): no atomics are used, so entries of idx are assumed distinct within
   one launch — duplicate indices would race; confirm against the call sites. */
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  const PetscInt stride = (PetscInt)gridDim.x*blockDim.x;
  for (PetscInt i = (PetscInt)blockIdx.x*blockDim.x + threadIdx.x; i < n; i += stride) y[idx[i]] += x[i];
}
2862 
2863 /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
/*
   MatMultAddKernel_SeqAIJCUSPARSE - common implementation behind MatMult, MatMultAdd,
   MatMultTranspose(Add) and MatMultHermitianTranspose(Add) for SEQAIJCUSPARSE matrices.

   Computes zz = op(A)*xx + yy when yy is non-NULL, otherwise zz = op(A)*xx, where op is
   the identity (trans = PETSC_FALSE), the transpose (trans = PETSC_TRUE, herm = PETSC_FALSE),
   or the conjugate transpose (trans = PETSC_TRUE, herm = PETSC_TRUE). yy == zz is allowed.
   Supports matrices stored with compressed (zero-free) rows via a scatter/gather through
   cusparsestruct->workVector.
*/
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny;
#endif

  PetscFunctionBegin;
  /* The conjugate (non-transposed) product is not implemented; only op in {N, T, H} */
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
  /* Empty matrix: op(A)*xx is zero, so only the optional add with yy remains */
  if (!a->nonzerorowcnt) {
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    /* Either ask cuSPARSE to transpose on the fly, or build/reuse an explicit transpose */
    if (herm || !A->form_explicit_transpose) {
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
     #endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
     #endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* opA is used to index matstruct->cuSpMV[], so guard against future enum changes */
      if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        cudaError_t cerr;
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                matstruct->matDescr,
                                matstruct->cuSpMV[opA].vecXDescr, beta,
                                matstruct->cuSpMV[opA].vecYDescr,
                                cusparse_scalartype,
                                cusparsestruct->spmvAlg,
                                &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
      }

      stat = cusparseSpMV(cusparsestruct->handle, opA,
                               matstruct->alpha_one,
                               matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTransposeForMult() */
                               matstruct->cuSpMV[opA].vecXDescr,
                               beta,
                               matstruct->cuSpMV[opA].vecYDescr,
                               cusparse_scalartype,
                               cusparsestruct->spmvAlg,
                               matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
     #else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha_one, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
     #endif
    } else {
      if (cusparsestruct->nrows) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha_one, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
       #endif
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
         */
       #if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
       #else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
       #endif
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  /* 2 flops (multiply + add) per nonzero; without the add, one add per nonzero row is saved */
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3054 
/* zz = A^T*xx + yy; forwards to the shared mult-add kernel with trans = PETSC_TRUE, herm = PETSC_FALSE */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3063 
/* Finish assembly via the SeqAIJ path, then discard the cached flat device matrix if the
   nonzero pattern changed during assembly (the cached copy would be stale). */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscObjectState   state_before = A->nonzerostate; /* snapshot to detect a pattern change */

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (cusp->deviceMat && state_before != A->nonzerostate) {
    cudaError_t cerr;

    ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr);
    cerr = cudaFree(cusp->deviceMat);CHKERRCUDA(cerr);
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}
3081 
3082 /* --------------------------------------------------------------------------------*/
3083 /*@
3084    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
3086    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
3087    assembly performance the user should preallocate the matrix storage by setting
3088    the parameter nz (or the array nnz).  By setting these parameters accurately,
3089    performance during matrix assembly can be increased by more than a factor of 50.
3090 
3091    Collective
3092 
3093    Input Parameters:
3094 +  comm - MPI communicator, set to PETSC_COMM_SELF
3095 .  m - number of rows
3096 .  n - number of columns
3097 .  nz - number of nonzeros per row (same for all rows)
3098 -  nnz - array containing the number of nonzeros in the various rows
3099          (possibly different for each row) or NULL
3100 
3101    Output Parameter:
3102 .  A - the matrix
3103 
3104    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
3106    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
3107 
3108    Notes:
3109    If nnz is given then nz is ignored
3110 
3111    The AIJ format (also called the Yale sparse matrix format or
3112    compressed row storage), is fully compatible with standard Fortran 77
3113    storage.  That is, the stored row and column indices can begin at
3114    either one (as in Fortran) or zero.  See the users' manual for details.
3115 
3116    Specify the preallocated storage with either nz or nnz (not both).
3117    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
3118    allocation.  For large problems you MUST preallocate memory or you
3119    will get TERRIBLE performance, see the users' manual chapter on matrices.
3120 
3121    By default, this format uses inodes (identical nodes) when possible, to
3122    improve numerical efficiency of matrix-vector products and solves. We
3123    search for consecutive rows with the same nonzero structure, thereby
3124    reusing matrix information to achieve increased efficiency.
3125 
3126    Level: intermediate
3127 
3128 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
3129 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Create an empty matrix of the CUSPARSE type, then delegate preallocation to the SeqAIJ routine */
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  /* sequential matrix: the local sizes double as the global sizes */
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3141 
/* Release the GPU-side data (plain or factored form), detach every composed query
   function installed by MatConvert/MatBindToCPU, then defer to the SeqAIJ destroy. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode    ierr;
  PetscInt          i;
  /* all function names composed on this object; each is detached below */
  const char *const composed[] = {"MatSeqAIJCopySubArray_C",
                                  "MatCUSPARSESetFormat_C",
                                  "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",
                                  "MatProductSetFromOptions_seqaijcusparse_seqdense_C",
                                  "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",
                                  "MatFactorGetSolverType_C",
                                  "MatSetPreallocationCOO_C",
                                  "MatSetValuesCOO_C"};

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  for (i=0; i<(PetscInt)(sizeof(composed)/sizeof(composed[0])); i++) {
    ierr = PetscObjectComposeFunction((PetscObject)A,composed[i],NULL);CHKERRQ(ierr);
  }
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3163 
3164 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
3165 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicate through the SeqAIJ path (host data), then convert the copy in place back to the CUSPARSE type */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3175 
/*
   Y = Y + a*X entirely on the GPU. Three paths:
     - different nonzero patterns (or one matrix not bound to GPU): fall back to the CPU MatAXPY_SeqAIJ;
     - SUBSET_NONZERO_PATTERN: cuSPARSE csrgeam computing 1.0*Y + a*X in place into Y;
     - SAME_NONZERO_PATTERN (given or detected): a single cuBLAS axpy on the value arrays.
   Only the MAT_CUSPARSE_CSR storage format is supported on the GPU paths.
*/
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry,*csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
  /* differing axpy implementations indicate one of the matrices is bound to the CPU */
  if (X->ops->axpy != Y->ops->axpy) {
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
  if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;
  /* see if we can turn this into a cublas axpy */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    /* patterns are identical iff both the row offsets and column indices match on the device */
    bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
    if (eq) {
      eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
    }
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    cusparseStatus_t stat;
    PetscScalar      b = 1.0; /* coefficient of Y: result is b*Y + a*X */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t           bufferSize;
    void             *buffer;
    cudaError_t      cerr;
#endif

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    /* a and b are host pointers here, so switch the pointer mode around the csrgeam calls */
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* CUDA >= 11 requires an explicit user-provided work buffer (csrgeam2 interface) */
    stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
                                          &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                          &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                             cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* output reuses Y's pattern and value array: in-place Y = b*Y + a*X (valid since X's pattern is a subset) */
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    cerr = cudaFree(buffer);CHKERRCUDA(cerr);
#else
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
#endif
    /* restore the device pointer mode expected by the rest of this file */
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else if (str == SAME_NONZERO_PATTERN) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t berr;
    PetscBLASInt   one = 1, bnz = 1;

    /* identical patterns: the value arrays line up entry-for-entry, so a dense axpy suffices */
    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr);
    ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else {
    /* general patterns: defer to the CPU implementation, which may change Y's pattern */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3273 
/* Y = a*Y, performed directly on the GPU copy of the nonzero values with one cuBLAS scal call */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)Y->data;
  PetscScalar    *values;
  cublasHandle_t handle;
  cublasStatus_t berr;
  PetscBLASInt   inc = 1, n = 1;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEGetArray(Y,&values);CHKERRQ(ierr); /* device pointer to the nonzeros */
  ierr = PetscCUBLASGetHandle(&handle);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(aij->nz,&n);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  berr = cublasXscal(handle,n,&a,values,inc);CHKERRCUBLAS(berr);
  ierr = PetscLogGpuFlops(n);CHKERRQ(ierr); /* one multiply per stored nonzero */
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSERestoreArray(Y,&values);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3295 
/* Zero all stored values on both the GPU copies (if they exist) and the CPU array,
   preserving the sparsity pattern, and set the offload mask accordingly. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;
  PetscBool      gpuzeroed = PETSC_FALSE;
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
    if (spptr->mat) {
      CsrMatrix *csr = (CsrMatrix*)spptr->mat->mat;
      if (csr->values) {
        gpuzeroed = PETSC_TRUE; /* device values zeroed too, so host and device stay in sync */
        thrust::fill(thrust::device,csr->values->begin(),csr->values->end(),0.);
      }
    }
    if (spptr->matTranspose) {
      CsrMatrix *csr = (CsrMatrix*)spptr->matTranspose->mat;
      if (csr->values) {
        thrust::fill(thrust::device,csr->values->begin(),csr->values->end(),0.);
      }
    }
  }
  /* zero the host values directly (rather than via MatZeroEntries_SeqAIJ) so we control the offload mask below */
  ierr = PetscArrayzero(aij->a,aij->i[A->rmap->n]);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  A->offloadmask = gpuzeroed ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
3326 
/*
   Switch the matrix's operation table and composed functions between the host (SeqAIJ)
   implementations (flg = PETSC_TRUE) and the device (CUSPARSE) implementations
   (flg = PETSC_FALSE). Factored matrices are left untouched.
*/
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  if (flg) {
    /* make sure the host copy is current before the CPU implementations take over */
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* detach the GPU-specific composed functions while bound to the CPU */
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  /* inodes are a CPU-side optimization; enable them only when bound to the CPU */
  a->inode.use = flg;
  PetscFunctionReturn(0);
}
3377 
/*
   Convert a SeqAIJ matrix to SEQAIJCUSPARSE. Depending on reuse, either duplicates A
   (MAT_INITIAL_MATRIX), copies into an existing *newmat (MAT_REUSE_MATRIX), or converts
   A itself (MAT_INPLACE_MATRIX). Allocates the CUSPARSE side container (plain or
   tri-factor form, each with its own cusparse handle on PETSc's default stream),
   installs the CUSPARSE operation table, and composes the format-setting function.
*/
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  B = *newmat;

  /* vectors created from this matrix (e.g. MatCreateVecs) default to the CUDA type */
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;
      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      spptr->format     = MAT_CUSPARSE_CSR;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      spptr->spmvAlg    = CUSPARSE_CSRMV_ALG1;    /* default, since we only support csr */
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
     #endif
      B->spptr = spptr;
    } else {
      /* factored matrices carry triangular-factor data instead of the plain container */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* installs the remaining GPU operations and composed functions */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3431 
/* Type constructor for MATSEQAIJCUSPARSE: create a SeqAIJ matrix, then convert it in place */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3441 
3442 /*MC
3443    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3444 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
3446    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
3447    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3448 
3449    Options Database Keys:
3450 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
3451 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3452 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3453 
3454   Level: beginner
3455 
3456 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3457 M*/
3458 
3459 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);
3460 
/* Register the CUSPARSE-based factorization back-ends with PETSc's solver-type registry */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode      ierr;
  PetscInt            i;
  /* factorization kinds served by the common MATSOLVERCUSPARSE getter */
  const MatFactorType ftypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};

  PetscFunctionBegin;
  /* the banded LU solver has a separate implementation and registers on plain MATSEQAIJ */
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr);
  for (i=0; i<(PetscInt)(sizeof(ftypes)/sizeof(ftypes[0])); i++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftypes[i],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3474 
/*
  MatSeqAIJCUSPARSE_Destroy - frees all GPU-side state held by a Mat_SeqAIJCUSPARSE struct
  and the struct itself; safe to call when *cusparsestruct is NULL.

  Teardown order: the two mult structs (matrix and its cached transpose) first, then the
  auxiliary thrust device arrays, and the cusparse handle last, before freeing the host struct.
*/
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    /* deleting a NULL pointer is a no-op in C++, so no guards are needed here */
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    delete (*cusparsestruct)->csr2csc_i;
    if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);  /* PetscFree() also zeroes the pointer */
  }
  PetscFunctionReturn(0);
}
3494 
/*
  CsrMatrix_Destroy - releases the three thrust device arrays (values, column indices,
  row offsets) owned by a CsrMatrix, frees the CsrMatrix object itself, and resets the
  caller's pointer.  A NULL *mat is silently accepted.
*/
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    CsrMatrix *m = *mat;

    /* operator delete on NULL members is harmless, so no individual guards */
    delete m->row_offsets;
    delete m->column_indices;
    delete m->values;
    delete m;
    *mat = 0;
  }
  PetscFunctionReturn(0);
}
3507 
/*
  MatSeqAIJCUSPARSEMultStruct_Destroy (triangular-factor overload) - frees one triangular
  factor: its cusparse matrix descriptor, analysis info, CSR storage, and the device/host
  scratch buffers used by the triangular solves.  Safe on a NULL *trifactor.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    if ((*trifactor)->solveBuffer)   {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
    if ((*trifactor)->AA_h)   {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* the csr2csc conversion buffer only exists with the CUDA 11 generic API */
    if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
   #endif
    ierr = PetscFree(*trifactor);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3527 
/*
  MatSeqAIJCUSPARSEMultStruct_Destroy (mult-struct overload) - frees the GPU storage backing
  a matrix-vector-product structure, honoring the storage format: HYB/ELL matrices are
  destroyed through cusparseDestroyHybMat() (pre-CUDA 11 only), CSR matrices through
  CsrMatrix_Destroy().  Also releases the descriptor, the compressed-row index array, the
  cached device-side scalar constants, and (CUDA >= 11) the SpMV descriptors/buffers.

  Fix: the CsrMatrix_Destroy() return code was previously ignored; it is now checked with
  CHKERRQ like every other call site in this file.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
       #endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr); /* was previously called without checking the error code */
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    /* device-resident copies of 1.0 and 0.0 used as SpMV alpha/beta arguments */
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    /* one cached SpMV configuration per operation (N, T, H); free only those initialized */
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
3571 
/*
  MatSeqAIJCUSPARSETriFactors_Reset - releases all factorization data (lower/upper factors
  and their transposes, permutation index arrays, work vector, and the band-factor device
  arrays) while keeping the container itself alive, so the factors can be rebuilt.

  Fix: a_band_d and i_band_d are now set to NULL after cudaFree().  They previously kept
  their stale values, so a second Reset (or a later Destroy, which calls Reset) would
  attempt to free already-freed device memory.
*/
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = NULL;
    (*trifactors)->cpermIndices = NULL;
    (*trifactors)->workVector = NULL;
    if ((*trifactors)->a_band_d)   {cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);}
    if ((*trifactors)->i_band_d)   {cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);}
    (*trifactors)->a_band_d = NULL; /* avoid double cudaFree() on a subsequent Reset/Destroy */
    (*trifactors)->i_band_d = NULL;
    (*trifactors)->init_dev_prop = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3594 
/*
  MatSeqAIJCUSPARSETriFactors_Destroy - full teardown of the triangular-factor container:
  resets all factor data, destroys the cusparse handle, and frees the container itself.
  A NULL *trifactors is silently accepted.
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    cusparseHandle_t handle = (*trifactors)->handle;

    ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3611 
/* Strict-weak ordering on (i,j) index pairs: row-major lexicographic less-than.
   Used as the comparator when sorting COO entries. */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    const PetscInt r1 = t1.get<0>(), r2 = t2.get<0>();
    return (r1 < r2) || (r1 == r2 && t1.get<1>() < t2.get<1>());
  }
};
3622 
/* Equality of (i,j) index pairs; used by thrust::unique to collapse duplicate COO entries. */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>();
  }
};
3632 
/* Binary op for adjacent_difference: 0 when consecutive values are equal, 1 otherwise
   (i.e. marks positions where the value changes). */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != t2) ? 1 : 0;
  }
};
3641 
/* Logical-OR of two flags, returned as a PetscInt (1 if either operand is nonzero). */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 || t2) ? 1 : 0;
  }
};
3650 
3651 #include <thrust/iterator/discard_iterator.h>
/*
  MatSetValuesCOO_SeqAIJCUSPARSE - scatters/reduces the COO values v into the CSR value
  array on the GPU, using the permutation (cusp->cooPerm) and duplicate-key map
  (cusp->cooPerm_a) computed by MatSetPreallocationCOO_SeqAIJCUSPARSE().

  Input:
  + A     - the matrix, previously preallocated in COO form
  . v     - values in the original COO ordering; may be host or device memory
            (detected with isCudaMem); v == NULL with INSERT_VALUES zeros the matrix
  - imode - INSERT_VALUES or ADD_VALUES

  cooPerm_a is non-NULL only when the COO input contained repeated (i,j) entries; in that
  case values sharing a destination are first combined with reduce_by_key.
*/
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY                           *cooPerm_v = NULL; /* temporary device copy of v when v lives on the host */
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                             *matrix;
  PetscErrorCode                        ierr;
  PetscInt                              n;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    /* no COO preallocation information: nothing to scatter, just run a normal assembly */
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) {
    /* NULL values: INSERT zeros the matrix, ADD is a no-op; still run the finalize bookkeeping */
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else {
    /* host input: stage it on the device first */
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v,v+n);
    d_v = cooPerm_v->data();
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
  }
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) {
      /* duplicates present: reduce permuted values per destination into cooPerm_w, then add */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* unique entries: add each permuted value straight into the CSR value array */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals());
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else {
      /* unique entries: overwrite CSR values with the permuted input */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    }
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU;  /* values were written on the device only */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr);
  ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr);
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
3728 
/*
  MatSeqAIJCUSPARSEInvalidateTranspose - marks the cached explicit transpose of A as stale
  (A->transupdated = PETSC_FALSE); when destroy is PETSC_TRUE, also frees the transpose
  mult struct and the cached csr2csc permutation so they will be rebuilt from scratch.
  A matrix without a CUSPARSE struct is a no-op.
*/
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (!cusp) PetscFunctionReturn(0);
  if (destroy) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
    delete cusp->csr2csc_i;
    cusp->csr2csc_i = NULL;
  }
  A->transupdated = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3745 
3746 #include <thrust/binary_search.h>
/*
  MatSetPreallocationCOO_SeqAIJCUSPARSE - builds the CSR nonzero structure of A on the GPU
  from n COO index pairs (coo_i[], coo_j[]), and records the permutation needed to later
  scatter values given in COO order (MatSetValuesCOO_SeqAIJCUSPARSE).

  On exit:
  + cusp->cooPerm   - permutation from sorted order back into the user's COO ordering
  . cusp->cooPerm_a - for inputs with repeated (i,j) pairs, maps each sorted COO entry to
                      its destination in the compacted CSR array (NULL when all unique)
  - a->i/a->j etc.  - host CSR structure mirrored back from the device

  The matrix is left unassembled (values are zeroed); the CUSPARSE matvec struct is
  created eagerly via MatSeqAIJCUSPARSECopyToGPU().
*/
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0;  /* nzr counts nonempty rows */
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  /* discard previously cached permutations if the number of COO entries changed */
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);           /* device copies of the COO indices */
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);   /* per-row entry counts -> row offsets (exclusive of leading 0) */

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* sort (i,j) pairs row-major; cooPerm records where each original entry went */
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare());
    *cusp->cooPerm_a = d_i;          /* snapshot sorted rows/cols before unique() compacts them */
    THRUSTINTARRAY w = d_j;

    auto nekey = thrust::unique(fkey, ekey, IJEqual());
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* I couldn't come up with a more elegant algorithm */
      /* mark positions where either index changes, OR the two masks, then inclusive-scan:
         the result maps each sorted (possibly duplicated) entry to its compacted slot */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff());
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());
      (*cusp->cooPerm_a)[0] = 0;
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum());
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>());
    }
    /* ii[r] = number of unique entries in rows 0..r (upper_bound over the sorted row indices) */
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(),
                        search_begin, search_begin + A->rmap->n,
                        ii.begin());
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* rebuild the host CSR arrays of the Mat_SeqAIJ from the device results */
    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0;
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
  } else {
    /* empty COO input: fall back to an empty standard preallocation */
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);

  A->assembled = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3845 
/*
  MatSeqAIJCUSPARSEGetArrayRead - returns a read-only device pointer to the CSR value array
  of A, first making sure the GPU copy is up to date.  Not supported for ELL/HYB storage.
  Pair with MatSeqAIJCUSPARSERestoreArrayRead(); the object state is not increased since
  the caller promises not to modify the values.
*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);  /* ensure device values are current */
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  PetscFunctionReturn(0);
}
3864 
/*
  MatSeqAIJCUSPARSERestoreArrayRead - companion to MatSeqAIJCUSPARSEGetArrayRead();
  invalidates the caller's pointer.  No state increase: the values were read-only.
*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  *a = NULL;
  PetscFunctionReturn(0);
}
3874 
/*
  MatSeqAIJCUSPARSEGetArray - returns a read/write device pointer to the CSR value array of
  A.  The GPU copy is refreshed first; since the caller may modify values, the offload mask
  is set to GPU and the cached transpose is invalidated (structure kept, values marked
  stale).  Pair with MatSeqAIJCUSPARSERestoreArray().  Not supported for ELL/HYB storage.
*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);  /* read/write access: device values must be current */
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;  /* caller may write: GPU becomes the authoritative copy */
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3895 
/*
  MatSeqAIJCUSPARSERestoreArray - companion to MatSeqAIJCUSPARSEGetArray(); increases the
  object state (values may have been modified) and invalidates the caller's pointer.
*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL;
  PetscFunctionReturn(0);
}
3908 
/*
  MatSeqAIJCUSPARSEGetArrayWrite - returns a write-only device pointer to the CSR value
  array of A.  Unlike the read/read-write variants, the GPU copy is NOT refreshed first
  (existing values are assumed to be overwritten in full).  The offload mask is set to GPU
  and the cached transpose invalidated.  Pair with MatSeqAIJCUSPARSERestoreArrayWrite().
  Not supported for ELL/HYB storage.
*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3928 
/*
  MatSeqAIJCUSPARSERestoreArrayWrite - companion to MatSeqAIJCUSPARSEGetArrayWrite();
  increases the object state (values were written) and invalidates the caller's pointer.
*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL;
  PetscFunctionReturn(0);
}
3941 
/* Row-major lexicographic less-than on (row, col, value, flag) tuples; only the first two
   components participate in the ordering.  Used when merging two COO streams. */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    const int r1 = t1.get<0>(), r2 = t2.get<0>();
    if (r1 != r2) return r1 < r2;
    return t1.get<1>() < t2.get<1>();
  }
};
3952 
/* Unary functor adding a fixed offset to an int; used to shift column indices / row
   offsets when concatenating matrices. */
struct Shift
{
  int _shift; /* the constant offset applied by operator() */

  Shift(int shift) : _shift(shift) {}
  __host__ __device__
  inline int operator() (const int &c)
  {
    return _shift + c;
  }
};
3964 
3965 /* merges two SeqAIJCUSPARSE matrices, [A';B']' operation in matlab notation */
3966 PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
3967 {
3968   PetscErrorCode               ierr;
3969   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
3970   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
3971   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
3972   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
3973   PetscInt                     Annz,Bnnz;
3974   cusparseStatus_t             stat;
3975   PetscInt                     i,m,n,zero = 0;
3976   cudaError_t                  cerr;
3977 
3978   PetscFunctionBegin;
3979   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
3980   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
3981   PetscValidPointer(C,4);
3982   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
3983   PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
3984   if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n);
3985   if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
3986   if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3987   if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3988   if (reuse == MAT_INITIAL_MATRIX) {
3989     m     = A->rmap->n;
3990     n     = A->cmap->n + B->cmap->n;
3991     ierr  = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
3992     ierr  = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
3993     ierr  = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
3994     c     = (Mat_SeqAIJ*)(*C)->data;
3995     Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
3996     Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
3997     Ccsr  = new CsrMatrix;
3998     Cmat->cprowIndices      = NULL;
3999     c->compressedrow.use    = PETSC_FALSE;
4000     c->compressedrow.nrows  = 0;
4001     c->compressedrow.i      = NULL;
4002     c->compressedrow.rindex = NULL;
4003     Ccusp->workVector       = NULL;
4004     Ccusp->nrows    = m;
4005     Ccusp->mat      = Cmat;
4006     Ccusp->mat->mat = Ccsr;
4007     Ccsr->num_rows  = m;
4008     Ccsr->num_cols  = n;
4009     stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
4010     stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4011     stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
4012     cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
4013     cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
4014     cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
4015     cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4016     cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4017     cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4018     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
4019     ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
4020     ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
4021     ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr);
4022     if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4023     if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4024 
4025     Acsr = (CsrMatrix*)Acusp->mat->mat;
4026     Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4027     Annz = (PetscInt)Acsr->column_indices->size();
4028     Bnnz = (PetscInt)Bcsr->column_indices->size();
4029     c->nz = Annz + Bnnz;
4030     Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
4031     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4032     Ccsr->values = new THRUSTARRAY(c->nz);
4033     Ccsr->num_entries = c->nz;
4034     Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
4035     if (c->nz) {
4036       auto Acoo = new THRUSTINTARRAY32(Annz);
4037       auto Bcoo = new THRUSTINTARRAY32(Bnnz);
4038       auto Ccoo = new THRUSTINTARRAY32(c->nz);
4039       THRUSTINTARRAY32 *Aroff,*Broff;
4040 
4041       if (a->compressedrow.use) { /* need full row offset */
4042         if (!Acusp->rowoffsets_gpu) {
4043           Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
4044           Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
4045           ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
4046         }
4047         Aroff = Acusp->rowoffsets_gpu;
4048       } else Aroff = Acsr->row_offsets;
4049       if (b->compressedrow.use) { /* need full row offset */
4050         if (!Bcusp->rowoffsets_gpu) {
4051           Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
4052           Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
4053           ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
4054         }
4055         Broff = Bcusp->rowoffsets_gpu;
4056       } else Broff = Bcsr->row_offsets;
4057       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4058       stat = cusparseXcsr2coo(Acusp->handle,
4059                               Aroff->data().get(),
4060                               Annz,
4061                               m,
4062                               Acoo->data().get(),
4063                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4064       stat = cusparseXcsr2coo(Bcusp->handle,
4065                               Broff->data().get(),
4066                               Bnnz,
4067                               m,
4068                               Bcoo->data().get(),
4069                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4070       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
4071       auto Aperm = thrust::make_constant_iterator(1);
4072       auto Bperm = thrust::make_constant_iterator(0);
4073 #if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
4074       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
4075       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
4076 #else
4077       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
4078       auto Bcib = Bcsr->column_indices->begin();
4079       auto Bcie = Bcsr->column_indices->end();
4080       thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
4081 #endif
4082       auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
4083       auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
4084       auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
4085       auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
4086       auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
4087       auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
4088       auto p1 = Ccusp->cooPerm->begin();
4089       auto p2 = Ccusp->cooPerm->begin();
4090       thrust::advance(p2,Annz);
4091       PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
4092 #if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
4093       thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
4094 #endif
4095       auto cci = thrust::make_counting_iterator(zero);
4096       auto cce = thrust::make_counting_iterator(c->nz);
4097 #if 0 //Errors on SUMMIT cuda 11.1.0
4098       PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
4099 #else
4100       auto pred = thrust::identity<int>();
4101       PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
4102       PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
4103 #endif
4104       stat = cusparseXcoo2csr(Ccusp->handle,
4105                               Ccoo->data().get(),
4106                               c->nz,
4107                               m,
4108                               Ccsr->row_offsets->data().get(),
4109                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4110       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4111       delete wPerm;
4112       delete Acoo;
4113       delete Bcoo;
4114       delete Ccoo;
4115 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4116       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
4117                                Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
4118                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4119                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4120 #endif
4121       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
4122         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4123         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4124         CsrMatrix *CcsrT = new CsrMatrix;
4125         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4126         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4127 
4128         (*C)->form_explicit_transpose = PETSC_TRUE;
4129         (*C)->transupdated = PETSC_TRUE;
4130         Ccusp->rowoffsets_gpu = NULL;
4131         CmatT->cprowIndices = NULL;
4132         CmatT->mat = CcsrT;
4133         CcsrT->num_rows = n;
4134         CcsrT->num_cols = m;
4135         CcsrT->num_entries = c->nz;
4136 
4137         CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
4138         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4139         CcsrT->values = new THRUSTARRAY(c->nz);
4140 
4141         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4142         auto rT = CcsrT->row_offsets->begin();
4143         if (AT) {
4144           rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
4145           thrust::advance(rT,-1);
4146         }
4147         if (BT) {
4148           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
4149           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
4150           thrust::copy(titb,tite,rT);
4151         }
4152         auto cT = CcsrT->column_indices->begin();
4153         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
4154         if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
4155         auto vT = CcsrT->values->begin();
4156         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4157         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4158         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4159 
4160         stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
4161         stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4162         stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
4163         cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
4164         cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
4165         cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
4166         cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4167         cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4168         cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4169 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4170         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
4171                                  CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
4172                                  CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4173                                  CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4174 #endif
4175         Ccusp->matTranspose = CmatT;
4176       }
4177     }
4178 
4179     c->singlemalloc = PETSC_FALSE;
4180     c->free_a       = PETSC_TRUE;
4181     c->free_ij      = PETSC_TRUE;
4182     ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
4183     ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
4184     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
4185       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4186       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4187       ii   = *Ccsr->row_offsets;
4188       jj   = *Ccsr->column_indices;
4189       cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4190       cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4191     } else {
4192       cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4193       cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4194     }
4195     ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
4196     ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
4197     ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
4198     c->maxnz = c->nz;
4199     c->nonzerorowcnt = 0;
4200     c->rmax = 0;
4201     for (i = 0; i < m; i++) {
4202       const PetscInt nn = c->i[i+1] - c->i[i];
4203       c->ilen[i] = c->imax[i] = nn;
4204       c->nonzerorowcnt += (PetscInt)!!nn;
4205       c->rmax = PetscMax(c->rmax,nn);
4206     }
4207     ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
4208     ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
4209     (*C)->nonzerostate++;
4210     ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
4211     ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
4212     Ccusp->nonzerostate = (*C)->nonzerostate;
4213     (*C)->preallocated  = PETSC_TRUE;
4214   } else {
4215     if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",(*C)->rmap->n,B->rmap->n);
4216     c = (Mat_SeqAIJ*)(*C)->data;
4217     if (c->nz) {
4218       Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
4219       if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
4220       if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4221       if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
4222       ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
4223       ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
4224       if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4225       if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4226       Acsr = (CsrMatrix*)Acusp->mat->mat;
4227       Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4228       Ccsr = (CsrMatrix*)Ccusp->mat->mat;
4229       if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
4230       if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
4231       if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
4232       if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
4233       if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
4234       auto pmid = Ccusp->cooPerm->begin();
4235       thrust::advance(pmid,Acsr->num_entries);
4236       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4237       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
4238                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
4239       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
4240                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4241       thrust::for_each(zibait,zieait,VecCUDAEquals());
4242       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
4243                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4244       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
4245                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
4246       thrust::for_each(zibbit,ziebit,VecCUDAEquals());
4247       ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr);
4248       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
4249         if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4250         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4251         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4252         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4253         CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
4254         auto vT = CcsrT->values->begin();
4255         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4256         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4257         (*C)->transupdated = PETSC_TRUE;
4258       }
4259       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4260     }
4261   }
4262   ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
4263   (*C)->assembled     = PETSC_TRUE;
4264   (*C)->was_assembled = PETSC_FALSE;
4265   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
4266   PetscFunctionReturn(0);
4267 }
4268 
/* Gather n entries of A's GPU-resident nonzero value array into v.

   If idx is given, v[k] = a[idx[k]] for k = 0..n-1 (gather); otherwise the
   first n values are copied verbatim. v may be either host or device memory;
   the destination kind is detected with isCudaMem() and the copy direction
   (and PETSc transfer logging) adjusted accordingly. A's values are accessed
   read-only via MatSeqAIJCUSPARSEGetArrayRead(), so no host<->device state
   change is triggered on A. */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  PetscErrorCode    ierr;
  bool              dmem;
  const PetscScalar *av;
  cudaError_t       cerr;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
  if (n && idx) {
    /* upload the index set, then gather on the device with a permutation iterator */
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

    THRUSTARRAY *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      /* v lives on the host: gather into a device scratch buffer first */
      w = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (w) {
      cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    delete w;
  } else {
    /* no index set: straight copy of the leading n values; direction depends on where v lives */
    cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  /* when v is host memory the data moved GPU->CPU, so log it as such
     (the original code logged this with PetscLogCpuToGpu, i.e. the wrong direction) */
  if (!dmem) { ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr); }
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4308