xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 82a78a4ef1c3dde9953e002d5a85008393775538)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
7 
8 #include <petscconf.h>
9 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
10 #include <../src/mat/impls/sbaij/seq/sbaij.h>
11 #include <../src/vec/vec/impls/dvecimpl.h>
12 #include <petsc/private/vecimpl.h>
13 #undef VecType
14 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
15 #include <thrust/adjacent_difference.h>
16 #include <thrust/async/for_each.h>
17 #include <thrust/iterator/constant_iterator.h>
18 #include <thrust/remove.h>
19 #include <thrust/sort.h>
20 #include <thrust/unique.h>
21 
22 const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
23 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
24   /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
25     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
26 
27   typedef enum {
28       CUSPARSE_MV_ALG_DEFAULT = 0,
29       CUSPARSE_COOMV_ALG      = 1,
30       CUSPARSE_CSRMV_ALG1     = 2,
31       CUSPARSE_CSRMV_ALG2     = 3
32   } cusparseSpMVAlg_t;
33 
34   typedef enum {
35       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
36       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
37       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
38       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
39       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
40       CUSPARSE_SPMM_ALG_DEFAULT = 0,
41       CUSPARSE_SPMM_COO_ALG1    = 1,
42       CUSPARSE_SPMM_COO_ALG2    = 2,
43       CUSPARSE_SPMM_COO_ALG3    = 3,
44       CUSPARSE_SPMM_COO_ALG4    = 5,
45       CUSPARSE_SPMM_CSR_ALG1    = 4,
46       CUSPARSE_SPMM_CSR_ALG2    = 6,
47   } cusparseSpMMAlg_t;
48 
49   typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
52   } cusparseCsr2CscAlg_t;
53   */
54   const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
55   const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
56   const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
57 #endif
58 
59 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
60 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
61 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
62 
63 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
64 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
65 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
66 
67 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
68 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
69 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
70 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
71 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
72 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
73 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
74 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
75 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
76 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
77 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
78 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
79 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
80 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
81 
82 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
83 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
84 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
85 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
86 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
87 
88 PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
89 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
90 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);
91 
92 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
93 
/*
   MatCUSPARSESetStream - records a CUDA stream in the matrix's cuSPARSE data and
   attaches it to the cuSPARSE handle, so later cuSPARSE calls on this matrix are
   queued on that stream.

   Input Parameters:
+  A      - matrix of type SEQAIJCUSPARSE; errors with PETSC_ERR_COR if spptr is missing
-  stream - the CUDA stream to use
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cstat;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusp->stream = stream;
  cstat = cusparseSetStream(cusp->handle,cusp->stream);CHKERRCUSPARSE(cstat);
  PetscFunctionReturn(0);
}
105 
/*
   MatCUSPARSESetHandle - installs a caller-supplied cuSPARSE handle on the matrix,
   destroying any previously held handle, and switches the handle to device
   pointer mode.

   Input Parameters:
+  A      - matrix of type SEQAIJCUSPARSE; errors with PETSC_ERR_COR if spptr is missing
-  handle - the cuSPARSE handle to adopt
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cstat;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusp->handle != handle) {
    /* release the handle we currently hold before adopting the caller's */
    if (cusp->handle) {
      cstat = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(cstat);
    }
    cusp->handle = handle;
  }
  cstat = cusparseSetPointerMode(cusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(cstat);
  PetscFunctionReturn(0);
}
122 
/*
   MatCUSPARSEClearHandle - forgets (without destroying) the cuSPARSE handle stored
   on a SEQAIJCUSPARSE matrix. A no-op for other matrix types or when no cuSPARSE
   data is attached.
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          iscusparse;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&iscusparse);CHKERRQ(ierr);
  if (!iscusparse || !cusp) PetscFunctionReturn(0);
  /* only drop the reference; ownership/destruction is handled elsewhere */
  if (cusp->handle) cusp->handle = 0;
  PetscFunctionReturn(0);
}
135 
/* Reports MATSOLVERCUSPARSE as the solver package backing this factored matrix */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
142 
143 /*MC
144   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
145   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
147   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
148   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
149   algorithms are not recommended. This class does NOT support direct solver operations.
150 
151   Level: beginner
152 
153 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
154 M*/
155 
/*
   MatGetFactor_seqaijcusparse_cusparse - creates an (empty) SEQAIJCUSPARSE matrix
   configured to serve as the factor matrix B for LU/ILU/ILUDT or Cholesky/ICC
   factorizations of A, wiring the symbolic-factor function pointers and the
   preferred orderings for each supported factor type.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;   /* sequential matrix: factor is square n x n */

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  /* factortype is set before MatSetType() below */
  (*B)->factortype = ftype;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  /* propagate a CPU binding from A to the factor when A requests it */
  if (A->boundtocpu && A->bindingpropagates) { ierr = MatBindToCPU(*B,PETSC_TRUE);CHKERRQ(ierr); }
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    /* CPU-bound matrices fall back to the plain SeqAIJ symbolic routines */
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    /* preferred orderings: nested dissection for full LU, natural for incomplete factorizations */
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);CHKERRQ(ierr);
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]);CHKERRQ(ierr);
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* no value arrays are allocated here; the symbolic phase allocates the factor storage */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  (*B)->canuseordering = PETSC_TRUE;
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
197 
/* Type-specific implementation of MatCUSPARSESetFormat() for SEQAIJCUSPARSE:
   a sequential matrix has a single storage format, so MAT_CUSPARSE_MULT and
   MAT_CUSPARSE_ALL set the same field. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT: /* fall through */
  case MAT_CUSPARSE_ALL:
    cusp->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
215 
216 /*@
217    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
218    operation. Only the MatMult operation can use different GPU storage formats
219    for MPIAIJCUSPARSE matrices.
220    Not Collective
221 
222    Input Parameters:
223 +  A - Matrix of type SEQAIJCUSPARSE
224 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
225 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
226 
227    Output Parameter:
228 
229    Level: intermediate
230 
231 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
232 @*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation if one is composed on A; no-op otherwise */
  err = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(err);
  PetscFunctionReturn(0);
}
242 
/* Type-specific implementation of MatCUSPARSESetUseCPUSolve(): records whether
   MatSolve() should run on the CPU instead of the GPU for this matrix.
   Fix: guard against a missing spptr before dereferencing, matching the checks
   in MatCUSPARSESetStream()/MatCUSPARSESetHandle() above. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A,PetscBool use_cpu)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusparsestruct->use_cpu_solve = use_cpu;
  PetscFunctionReturn(0);
}
251 
252 /*@
   MatCUSPARSESetUseCPUSolve - Sets whether the CPU MatSolve is used in place of the GPU solve.
254 
255    Input Parameters:
256 +  A - Matrix of type SEQAIJCUSPARSE
257 -  use_cpu - set flag for using the built-in CPU MatSolve
258 
259    Output Parameter:
260 
261    Notes:
262    The cuSparse LU solver currently computes the factors with the built-in CPU method
263    and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
   This method is used to specify whether the solve is done on the CPU or GPU (GPU is the default).
265 
266    Level: intermediate
267 
268 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
269 @*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A,PetscBool use_cpu)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* forward to the type-specific implementation if one is composed on A; no-op otherwise */
  err = PetscTryMethod(A,"MatCUSPARSESetUseCPUSolve_C",(Mat,PetscBool),(A,use_cpu));CHKERRQ(err);
  PetscFunctionReturn(0);
}
279 
/* MatSetOption implementation for SEQAIJCUSPARSE: handles MAT_FORM_EXPLICIT_TRANSPOSE
   itself and delegates every other option to the SeqAIJ implementation. */
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* destroy any cached transpose when turning the option off, to avoid stale
       data if the option is re-enabled later */
    if (A->form_explicit_transpose && !flg) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);}
    A->form_explicit_transpose = flg;
  } else {
    ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
297 
298 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
299 
/* Numeric LU factorization: the factorization itself runs on the CPU via the
   SeqAIJ routine; afterwards the appropriate GPU (or CPU) MatSolve variants are
   installed and, unless CPU solves were requested, the triangular factors are
   analyzed and copied to the GPU. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ         *b    = (Mat_SeqAIJ*)B->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  IS                 rowperm = b->row,colperm = b->col;
  PetscBool          rowid,colid;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  /* A's values must be current on the host before the CPU factorization */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* choose the MatSolve flavor: identity permutations permit the natural-ordering kernels */
  ierr = ISIdentity(rowperm,&rowid);CHKERRQ(ierr);
  ierr = ISIdentity(colperm,&colid);CHKERRQ(ierr);
  if (!cusp->use_cpu_solve) {
    if (rowid && colid) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* stage the triangular factors on the GPU for the device solve path */
  if (!cusp->use_cpu_solve) {
    ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
337 
/* Processes -mat_cusparse_* command-line options for an unfactored SEQAIJCUSPARSE
   matrix: storage format for SpMV and for all operations, CPU-vs-GPU (I)LU solve,
   and (CUDA >= 11) the cuSPARSE SpMV/SpMM/csr2csc algorithm choices. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  /* options below only apply to unfactored matrices */
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
    ierr = PetscOptionsBool("-mat_cusparse_use_cpu_solve","Use CPU (I)LU solve","MatCUSPARSESetUseCPUSolve",cusparsestruct->use_cpu_solve,&cusparsestruct->use_cpu_solve,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetUseCPUSolve(A,cusparsestruct->use_cpu_solve);CHKERRQ(ierr);}
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
    if (flg && CUSPARSE_SPMV_CSR_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#else
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#endif
    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    /* same position-vs-value consistency check for the SpMM enum */
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    /* same position-vs-value consistency check for the csr2csc enum */
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
   #endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
378 
/* Symbolic ILU: discard any existing GPU triangular-factor data, run the CPU
   symbolic phase, and route the numeric phase to the CUSPARSE implementation. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&trifactors);CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
390 
/* Symbolic LU: discard any existing GPU triangular-factor data, run the CPU
   symbolic phase, and route the numeric phase to the CUSPARSE implementation. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&trifactors);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
402 
/* Symbolic ICC: discard any existing GPU triangular-factor data, run the CPU
   symbolic phase, and route the numeric phase to the CUSPARSE implementation. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&trifactors);CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
414 
/* Symbolic Cholesky: discard any existing GPU triangular-factor data, run the
   CPU symbolic phase, and route the numeric phase to the CUSPARSE implementation. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&trifactors);CHKERRQ(ierr);
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
426 
/* Builds (or refreshes) the unit-diagonal lower triangular factor L of an
   ILU factorization on the GPU.  The factored values live in the host SeqAIJ
   arrays (a->i/a->j/a->a); this routine assembles a CSR copy of the strictly
   lower part with an explicit 1.0 appended on each row's diagonal, uploads it
   into thrust device arrays, and runs the cuSPARSE triangular-solve analysis.
   On subsequent calls (loTriFactor already present) only the numerical values
   are refilled and re-uploaded; the sparsity pattern and analysis are reused. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;   /* pinned host CSR row offsets / column indices for L */
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* rebuild only when the current data lives on the CPU (or nothing is on the GPU yet) */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* NOTE(review): row 0's ai entries are skipped via ai[1]; presumably the factored
         storage keeps no strictly-lower entries for row 0 — consistent with the loop
         below starting at i=1 */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {
        PetscScalar                       *AALo;   /* pinned host values array for L */

        cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the lower triangular matrix */
        cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the lower triangular matrix */
        AiLo[0]  = (PetscInt) 0;
        AiLo[n]  = nzLower;
        AjLo[0]  = (PetscInt) 0;       /* row 0 holds only its unit diagonal */
        AALo[0]  = (MatScalar) 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;                  /* next free slot in AjLo/AALo */
        rowOffset= 1;                  /* running row-offset (CSR) value */
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          /* copy the strictly-lower entries of row i, then append the unit diagonal */
          ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix: copy the pinned host CSR arrays into device (thrust) storage */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* newer cuSPARSE needs an explicit work buffer for the analysis/solve */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(),
                               #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 loTriFactor->solveInfo,
                                 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                               #else
                                 loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
                               #endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        /* AA_h (the pinned values array) is kept for fast value updates later;
           the index arrays are no longer needed on the host */
        loTriFactor->AA_h = AALo;
        cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0]  = 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* same layout as the build branch: row values then unit diagonal */
          ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
          offset      += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset      += 1;
          v  += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
573 
574 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
575 {
576   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
577   PetscInt                          n = A->rmap->n;
578   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
579   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
580   cusparseStatus_t                  stat;
581   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
582   const MatScalar                   *aa = a->a,*v;
583   PetscInt                          *AiUp, *AjUp;
584   PetscInt                          i,nz, nzUpper, offset;
585   PetscErrorCode                    ierr;
586   cudaError_t                       cerr;
587 
588   PetscFunctionBegin;
589   if (!n) PetscFunctionReturn(0);
590   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
591     try {
592       /* next, figure out the number of nonzeros in the upper triangular matrix. */
593       nzUpper = adiag[0]-adiag[n];
594       if (!upTriFactor) {
595         PetscScalar *AAUp;
596 
597         cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
598 
599         /* Allocate Space for the upper triangular matrix */
600         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
601         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
602 
603         /* Fill the upper triangular matrix */
604         AiUp[0]=(PetscInt) 0;
605         AiUp[n]=nzUpper;
606         offset = nzUpper;
607         for (i=n-1; i>=0; i--) {
608           v  = aa + adiag[i+1] + 1;
609           vi = aj + adiag[i+1] + 1;
610 
611           /* number of elements NOT on the diagonal */
612           nz = adiag[i] - adiag[i+1]-1;
613 
614           /* decrement the offset */
615           offset -= (nz+1);
616 
617           /* first, set the diagonal elements */
618           AjUp[offset] = (PetscInt) i;
619           AAUp[offset] = (MatScalar)1./v[nz];
620           AiUp[i]      = AiUp[i+1] - (nz+1);
621 
622           ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
623           ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
624         }
625 
626         /* allocate space for the triangular factor information */
627         ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
628         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
629 
630         /* Create the matrix description */
631         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
632         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
633        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
634         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
635        #else
636         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
637        #endif
638         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
639         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
640 
641         /* set the operation */
642         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
643 
644         /* set the matrix */
645         upTriFactor->csrMat = new CsrMatrix;
646         upTriFactor->csrMat->num_rows = n;
647         upTriFactor->csrMat->num_cols = n;
648         upTriFactor->csrMat->num_entries = nzUpper;
649 
650         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
651         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
652 
653         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
654         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
655 
656         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
657         upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
658 
659         /* Create the solve analysis information */
660         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
661         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
662       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
663         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
664                                      upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
665                                      upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
666                                      upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
667                                      &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
668         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
669       #endif
670 
671         /* perform the solve analysis */
672         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
673                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
674                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
675                                  upTriFactor->csrMat->column_indices->data().get(),
676                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
677                                  upTriFactor->solveInfo,
678                                  upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
679                                #else
680                                  upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
681                                #endif
682         cerr = WaitForCUDA();CHKERRCUDA(cerr);
683         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
684 
685         /* assign the pointer */
686         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
687         upTriFactor->AA_h = AAUp;
688         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
689         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
690         ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
691       } else {
692         if (!upTriFactor->AA_h) {
693           cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
694         }
695         /* Fill the upper triangular matrix */
696         offset = nzUpper;
697         for (i=n-1; i>=0; i--) {
698           v  = aa + adiag[i+1] + 1;
699 
700           /* number of elements NOT on the diagonal */
701           nz = adiag[i] - adiag[i+1]-1;
702 
703           /* decrement the offset */
704           offset -= (nz+1);
705 
706           /* first, set the diagonal elements */
707           upTriFactor->AA_h[offset] = 1./v[nz];
708           ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
709         }
710         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
711         ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
712       }
713     } catch(char *ex) {
714       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
715     }
716   }
717   PetscFunctionReturn(0);
718 }
719 
/* Push the ILU triangular factors of A to the GPU and cache the row/column
   permutations (when they are not the identity) as device index arrays, so the
   GPU MatSolve can apply them without further host traffic. */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           rowis    = aij->row;
  IS                           colis    = aij->icol;
  PetscBool                    rowIsIdentity,colIsIdentity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!factors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  /* (re)build both triangular factors on the device */
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  if (!factors->workVector) factors->workVector = new THRUSTARRAY(n);
  factors->nnz = aij->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* cache the row permutation on the GPU unless it is the identity */
  ierr = ISIdentity(rowis,&rowIsIdentity);CHKERRQ(ierr);
  if (!rowIsIdentity && !factors->rpermIndices) {
    const PetscInt *idx;

    ierr = ISGetIndices(rowis,&idx);CHKERRQ(ierr);
    factors->rpermIndices = new THRUSTINTARRAY(n);
    factors->rpermIndices->assign(idx,idx+n);
    ierr = ISRestoreIndices(rowis,&idx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* cache the column permutation on the GPU unless it is the identity */
  ierr = ISIdentity(colis,&colIsIdentity);CHKERRQ(ierr);
  if (!colIsIdentity && !factors->cpermIndices) {
    const PetscInt *idx;

    ierr = ISGetIndices(colis,&idx);CHKERRQ(ierr);
    factors->cpermIndices = new THRUSTINTARRAY(n);
    factors->cpermIndices->assign(idx,idx+n);
    ierr = ISRestoreIndices(colis,&idx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
763 
/*
   Build (first call) or numerically refresh (subsequent calls) the GPU data
   structures for the triangular solves of an ICC/Cholesky factor.

   The host factor stores only the upper triangle; from it we pack, in pinned
   host memory, a CSR upper-triangular matrix (AiUp/AjUp/AAUp) and a second
   value array AALo with rescaled entries.  The "lower" solve reuses the SAME
   upper CSR pattern: its descriptor keeps CUSPARSE_FILL_MODE_UPPER but the
   solve operation is CUSPARSE_OPERATION_TRANSPOSE, so L = U^T is never stored
   explicitly.  Presumably the AALo rescaling (entries divided by the diagonal,
   diagonal replaced by its inverse) makes U^T with these values the correct
   lower factor -- TODO confirm against the ICC factorization kernels.

   Only packs/uploads when the host copy is fresher than the device copy
   (offloadmask UNALLOCATED or CPU); no-op for 0-row matrices.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is viewed both as Mat_SeqAIJ (above) and Mat_SeqSBAIJ;
     this relies on the i/j/a members lining up for the factor storage -- verify */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers for the two value arrays (upper and rescaled lower) */
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (!upTriFactor && !loTriFactor) {
        /* First call: build pattern + values + cuSPARSE metadata from scratch. */
        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          /* v[nz] is the last entry of the row; presumably the host factor stores
             the (inverted?) diagonal there -- TODO confirm storage convention */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            /* off-diagonals: negated for the upper solve, additionally scaled by
               the inverse diagonal for the transposed (lower) solve */
            ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        /* host -> device copies of the packed CSR arrays */
        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(),
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 upTriFactor->solveInfo,
                                 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                                #else
                                  upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
                                #endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        /* UPPER fill + TRANSPOSE op below: the lower solve is done as U^T with
           the rescaled values AALo, reusing the upper CSR pattern */
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        /* same pattern as the upper factor, different values (AALo) */
        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(),
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 loTriFactor->solveInfo,
                                 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                                #else
                                 loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
                                #endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
        /* pattern buffers only needed on the first call */
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      } else {
        /* Refresh path: pattern already on the GPU, only repack and re-upload values. */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
983 
/* Push the ICC triangular factor data of A to the GPU and, for a non-natural
   ordering, upload both the permutation and its inverse as device index
   arrays for use by the permuted MatSolve path. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           perm     = aij->row;
  PetscBool                    identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!factors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!factors->workVector) factors->workVector = new THRUSTARRAY(n);
  /* strictly triangular entries are counted in both factors, the diagonal once */
  factors->nnz = (aij->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* upload the permutation and its inverse unless the ordering is natural */
  ierr = ISIdentity(perm,&identity);CHKERRQ(ierr);
  if (!identity) {
    IS             iperm;
    const PetscInt *pidx,*ipidx;

    ierr = ISInvertPermutation(perm,PETSC_DECIDE,&iperm);CHKERRQ(ierr);
    ierr = ISGetIndices(perm,&pidx);CHKERRQ(ierr);
    ierr = ISGetIndices(iperm,&ipidx);CHKERRQ(ierr);
    factors->rpermIndices = new THRUSTINTARRAY(n);
    factors->rpermIndices->assign(pidx,pidx+n);
    factors->cpermIndices = new THRUSTINTARRAY(n);
    factors->cpermIndices->assign(ipidx,ipidx+n);
    ierr = ISRestoreIndices(iperm,&ipidx);CHKERRQ(ierr);
    ierr = ISRestoreIndices(perm,&pidx);CHKERRQ(ierr);
    ierr = ISDestroy(&iperm);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1021 
/* Numeric Cholesky for SeqAIJCUSPARSE: the factorization itself runs on the
   CPU (after syncing A from the GPU), then the solve function pointers are
   selected and the factors are copied back to the device. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *bf = (Mat_SeqAIJ*)B->data;
  IS             ip  = bf->row;
  PetscBool      natural;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* natural ordering selects the MatSolve variant that skips the permutation kernels */
  ierr = ISIdentity(ip,&natural);CHKERRQ(ierr);
  B->ops->solve             = natural ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering : MatSolve_SeqAIJCUSPARSE;
  B->ops->solvetranspose    = natural ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering : MatSolveTranspose_SeqAIJCUSPARSE;
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1051 
1052 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
1053 {
1054   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1055   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1056   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1057   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
1058   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1059   cusparseStatus_t                  stat;
1060   cusparseIndexBase_t               indexBase;
1061   cusparseMatrixType_t              matrixType;
1062   cusparseFillMode_t                fillMode;
1063   cusparseDiagType_t                diagType;
1064   cudaError_t                       cerr;
1065   PetscErrorCode                    ierr;
1066 
1067   PetscFunctionBegin;
1068   /* allocate space for the transpose of the lower triangular factor */
1069   ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
1070   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1071 
1072   /* set the matrix descriptors of the lower triangular factor */
1073   matrixType = cusparseGetMatType(loTriFactor->descr);
1074   indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
1075   fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1076     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1077   diagType = cusparseGetMatDiagType(loTriFactor->descr);
1078 
1079   /* Create the matrix description */
1080   stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
1081   stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
1082   stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
1083   stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
1084   stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
1085 
1086   /* set the operation */
1087   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1088 
1089   /* allocate GPU space for the CSC of the lower triangular factor*/
1090   loTriFactorT->csrMat = new CsrMatrix;
1091   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
1092   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
1093   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
1094   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
1095   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1096   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
1097 
1098   /* compute the transpose of the lower triangular factor, i.e. the CSC */
1099 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1100   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1101                                        loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1102                                        loTriFactor->csrMat->values->data().get(),
1103                                        loTriFactor->csrMat->row_offsets->data().get(),
1104                                        loTriFactor->csrMat->column_indices->data().get(),
1105                                        loTriFactorT->csrMat->values->data().get(),
1106                                        loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1107                                        CUSPARSE_ACTION_NUMERIC,indexBase,
1108                                        CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1109   cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1110 #endif
1111 
1112   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1113   stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1114                           loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1115                           loTriFactor->csrMat->values->data().get(),
1116                           loTriFactor->csrMat->row_offsets->data().get(),
1117                           loTriFactor->csrMat->column_indices->data().get(),
1118                           loTriFactorT->csrMat->values->data().get(),
1119                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1120                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1121                           CUSPARSE_ACTION_NUMERIC, indexBase,
1122                           CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat);
1123                         #else
1124                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1125                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1126                         #endif
1127   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1128   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1129 
1130   /* Create the solve analysis information */
1131   ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1132   stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1133 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1134   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
1135                                 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1136                                 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1137                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
1138                                 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1139   cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1140 #endif
1141 
1142   /* perform the solve analysis */
1143   stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
1144                            loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1145                            loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1146                            loTriFactorT->csrMat->column_indices->data().get(),
1147                           #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1148                            loTriFactorT->solveInfo,
1149                            loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
1150                           #else
1151                            loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1152                           #endif
1153   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1154   ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1155 
1156   /* assign the pointer */
1157   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1158 
1159   /*********************************************/
1160   /* Now the Transpose of the Upper Tri Factor */
1161   /*********************************************/
1162 
1163   /* allocate space for the transpose of the upper triangular factor */
1164   ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
1165   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1166 
1167   /* set the matrix descriptors of the upper triangular factor */
1168   matrixType = cusparseGetMatType(upTriFactor->descr);
1169   indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
1170   fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1171     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1172   diagType = cusparseGetMatDiagType(upTriFactor->descr);
1173 
1174   /* Create the matrix description */
1175   stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
1176   stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
1177   stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
1178   stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
1179   stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
1180 
1181   /* set the operation */
1182   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1183 
1184   /* allocate GPU space for the CSC of the upper triangular factor*/
1185   upTriFactorT->csrMat = new CsrMatrix;
1186   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1187   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1188   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1189   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
1190   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1191   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1192 
1193   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1194 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1195   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
1196                                 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1197                                 upTriFactor->csrMat->values->data().get(),
1198                                 upTriFactor->csrMat->row_offsets->data().get(),
1199                                 upTriFactor->csrMat->column_indices->data().get(),
1200                                 upTriFactorT->csrMat->values->data().get(),
1201                                 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1202                                 CUSPARSE_ACTION_NUMERIC,indexBase,
1203                                 CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1204   cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1205 #endif
1206 
1207   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1208   stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
1209                           upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1210                           upTriFactor->csrMat->values->data().get(),
1211                           upTriFactor->csrMat->row_offsets->data().get(),
1212                           upTriFactor->csrMat->column_indices->data().get(),
1213                           upTriFactorT->csrMat->values->data().get(),
1214                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1215                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1216                           CUSPARSE_ACTION_NUMERIC, indexBase,
1217                           CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat);
1218                         #else
1219                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1220                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1221                         #endif
1222 
1223   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1224   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1225 
1226   /* Create the solve analysis information */
1227   ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1228   stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1229   #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1230   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
1231                                  upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1232                                  upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1233                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
1234                                  &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1235   cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1236   #endif
1237 
1238   /* perform the solve analysis */
1239   stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
1240                            upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1241                            upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1242                            upTriFactorT->csrMat->column_indices->data().get(),
1243                           #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1244                            upTriFactorT->solveInfo,
1245                            upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
1246                           #else
1247                            upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1248                           #endif
1249 
1250   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1251   ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1252 
1253   /* assign the pointer */
1254   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1255   PetscFunctionReturn(0);
1256 }
1257 
/* Unary functor that converts a PetscScalar to a PetscInt by truncating
   its real part.  Callable from both host and device code; used below with
   thrust::transform to turn csr2csc-permuted values back into indices. */
struct PetscScalarToPetscInt
{
  __host__ __device__
  PetscInt operator()(PetscScalar scalar)
  {
    return static_cast<PetscInt>(PetscRealPart(scalar));
  }
};
1266 
/*
   Build (or refresh) an explicit transpose of A on the GPU, cached in
   cusparsestruct->matTranspose.

   The expensive structural conversion (csr2csc) is done only on the first
   call for CSR storage: the permutation it induces is cached in
   cusparsestruct->csr2csc_i, so later calls merely re-permute the current
   matrix values into the transpose (the thrust::copy at the end).
   Returns immediately when A->transupdated says the cache is current.
*/
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(0); /* cached transpose is current; nothing to do */
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
    /* ELL/HYB cannot reuse the cached structure, so rebuild the transpose from scratch */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
  }
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

    /* set alpha and beta: device-resident scalar constants used by SpMV-like calls */
    err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* A^T has the dimensions of A swapped and the same number of entries */
      CsrMatrix *matrixT = new CsrMatrix;
      matstructT->mat = matrixT;
      matrixT->num_rows = A->cmap->n;
      matrixT->num_cols = A->rmap->n;
      matrixT->num_entries = a->nz;
      matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values = new THRUSTARRAY(a->nz);

      /* mirror the host row offsets of A onto the GPU for later csr2csc calls */
      if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      #if PETSC_PKG_CUDA_VERSION_GE(11,2,1)
        stat = cusparseCreateCsr(&matstructT->matDescr,
                               matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                               matrixT->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
      #else
        /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
           see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

           I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
           it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
           when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
        */
        if (matrixT->num_entries) {
          stat = cusparseCreateCsr(&matstructT->matDescr,
                                 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                                 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                                 matrixT->values->data().get(),
                                 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I,
                                 indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);

        } else {
          matstructT->matDescr = NULL;
          matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
        }
      #endif
     #endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
   #else
      /* legacy (pre-CUDA-11) path: transpose via HYB -> CSR -> CSC -> HYB */
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows = A->rmap->n;
      temp->num_cols = A->cmap->n;
      temp->num_entries = a->nz;
      temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle,
                              matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get());CHKERRCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      /* NOTE(review): tempT holds the transpose, so for rectangular A its
         num_rows would seem to need to be A->cmap->n (and row_offsets sized
         A->cmap->n+1); as written the sizes only agree when A is square.
         The csr2hyb call below likewise passes the untransposed dimensions.
         TODO confirm whether this legacy path is ever used with rectangular A. */
      tempT->num_rows = A->rmap->n;
      tempT->num_cols = A->cmap->n;
      tempT->num_entries = a->nz;
      tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                              temp->num_cols, temp->num_entries,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get(),
                              tempT->values->data().get(),
                              tempT->column_indices->data().get(),
                              tempT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

      /* Last, convert CSC to HYB */
      cusparseHybMat_t hybMat;
      stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
      cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
        CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                              matstructT->descr, tempT->values->data().get(),
                              tempT->row_offsets->data().get(),
                              tempT->column_indices->data().get(),
                              hybMat, 0, partition);CHKERRCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY*) tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
        delete (CsrMatrix*) tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY*) temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
        delete (CsrMatrix*) temp;
      }
     #endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
    if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix");
    if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows");
    if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols");
    if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values");
    if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT");
    if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows");
    if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols");
    if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
      ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    if (!cusparsestruct->csr2csc_i) {
      /* One-time structural pass: run csr2csc on the value sequence 0,1,2,...
         so that matrixT->values receives, for each transposed entry, the index
         of the source entry in A.  That permutation is cached in csr2csc_i. */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      void   *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                           A->cmap->n, matrix->num_entries,
                                           matrix->values->data().get(),
                                           cusparsestruct->rowoffsets_gpu->data().get(),
                                           matrix->column_indices->data().get(),
                                           matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                           CUSPARSE_ACTION_NUMERIC,indexBase,
                                           cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat);
      err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err);
     #endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                              A->cmap->n,matrix->num_entries,
                              csr2csc_a.data().get(),
                              cusparsestruct->rowoffsets_gpu->data().get(),
                              matrix->column_indices->data().get(),
                              matrixT->values->data().get(),
                             #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                              matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                              CUSPARSE_ACTION_NUMERIC,indexBase,
                              cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat);
                             #else
                              matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
                             #endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
      }

      /* convert the permuted 0,1,2,... sequence back to integer indices */
      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      err = cudaFree(csr2cscBuffer);CHKERRCUDA(err);
     #endif
    }
    /* numeric update: gather the current values of A through the cached permutation */
    PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
                                                     thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
                                                     matrixT->values->begin()));
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
1496 
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
   Triangular solve with the transposed factors, with factorization
   permutations applied: since A = LU, A^T = U^T L^T, so the transposed
   upper factor is applied before the transposed lower one.  The transposed
   factor structures are built lazily on first use.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: x <- b(rperm).
     NOTE(review): the end iterator uses base bGPU+n while MatSolve_SeqAIJCUSPARSE
     uses the unshifted base; the copied range is determined by the rpermIndices
     begin/end iterators either way, so the two spellings behave the same. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U: tempGPU <- U^{-T} x */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Then, solve L: x <- L^{-T} tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                         xarray);CHKERRCUSPARSE(stat);
                      #endif

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1585 
/*
   Transposed triangular solve in natural ordering (no row/column
   permutations): apply U^T then L^T using the lazily built transposed
   factor structures.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *fs   = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work = (THRUSTARRAY*)fs->workVector;
  const PetscScalar                 *b;
  PetscScalar                       *x;
  cusparseStatus_t                  cstat;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Build the transposed factor structures on first use */
  if (!loT && !upT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loT = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtrTranspose;
    upT = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtrTranspose;
  }

  /* Map the vectors onto the GPU */
  ierr = VecCUDAGetArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&b);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* Upper factor first: work <- solve(U^T) applied to b */
  cstat = cusparse_solve(fs->handle, upT->solveOp,
                         upT->csrMat->num_rows,
                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                         upT->csrMat->num_entries,
                       #endif
                         &PETSC_CUSPARSE_ONE, upT->descr,
                         upT->csrMat->values->data().get(),
                         upT->csrMat->row_offsets->data().get(),
                         upT->csrMat->column_indices->data().get(),
                         upT->solveInfo,
                         b,
                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                         work->data().get(),
                         upT->solvePolicy, upT->solveBuffer);CHKERRCUSPARSE(cstat);
                       #else
                         work->data().get());CHKERRCUSPARSE(cstat);
                       #endif

  /* Then the lower factor: x <- solve(L^T) applied to work */
  cstat = cusparse_solve(fs->handle, loT->solveOp,
                         loT->csrMat->num_rows,
                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                         loT->csrMat->num_entries,
                       #endif
                         &PETSC_CUSPARSE_ONE, loT->descr,
                         loT->csrMat->values->data().get(),
                         loT->csrMat->row_offsets->data().get(),
                         loT->csrMat->column_indices->data().get(),
                         loT->solveInfo,
                         work->data().get(),
                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                         x,
                         loT->solvePolicy, loT->solveBuffer);CHKERRCUSPARSE(cstat);
                       #else
                         x);CHKERRCUSPARSE(cstat);
                       #endif

  /* Release the device arrays and account for the work done */
  ierr = VecCUDARestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*fs->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1655 
/*
   Triangular solve x = U^{-1} L^{-1} b(rperm) with the factorization's
   row permutation applied to b on entry and the column permutation applied
   to the result on exit.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: tempGPU <- b(rperm) */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L: xarray <- L^{-1} tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                         xarray,
                         loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                         xarray);CHKERRCUSPARSE(stat);
                      #endif

  /* Then, solve U: tempGPU <- U^{-1} xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,xarray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Last, reorder with the column permutation: xGPU <- tempGPU(cperm) */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1731 
/*
  MatSolve_SeqAIJCUSPARSE_NaturalOrdering - GPU triangular solve A x = b using the
  cached L and U factors, for the natural (identity) ordering: unlike the permuted
  variant above, b is consumed and x produced without any row/column permutation.

  Performs L z = b into the work vector, then U x = z directly into the array of xx.

  Notes:
  The #if branches select between the pre-CUDA-9 csrsv API and the newer csrsv2-style
  API, which additionally takes the nonzero count, a solve policy, and a
  preallocated external buffer.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; /* holds z = L^{-1} b between the two solves */
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L: reads b, writes the intermediate result into the work vector */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        loTriFactor->solvePolicy,loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Next, solve U: reads the work vector, writes the solution into x */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        xarray);CHKERRCUSPARSE(stat);
                      #endif

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* a forward+backward substitution costs ~2*nnz flops minus the n divisions folded in */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1793 
/*
  MatSeqAIJCUSPARSECopyFromGPU - mirror the matrix values from the device CSR back
  into the host-side AIJ arrays when the GPU holds the most recent copy.

  Only the numerical values are transferred; the sparsity structure on the host is
  reused as-is. On success both copies are marked valid (PETSC_OFFLOAD_BOTH).
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *aij  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cudaError_t        cerr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  /* nothing to do unless the device copy is strictly newer than the host one */
  if (A->offloadmask != PETSC_OFFLOAD_GPU) PetscFunctionReturn(0);
  {
    CsrMatrix *csr = (CsrMatrix*)cusp->mat->mat;

    ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    cerr = cudaMemcpy(aij->a, csr->values->data().get(), aij->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuToCpu(aij->nz*sizeof(PetscScalar));CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
1814 
/* Hand out the host value array of A for read/write access, first syncing the
   host copy with the device if the device one is newer. */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ     *aij;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* make sure the CPU values are current before exposing them */
  ierr   = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  aij    = (Mat_SeqAIJ*)A->data;
  *array = aij->a;
  PetscFunctionReturn(0);
}
1824 
/* Counterpart of MatSeqAIJGetArray_SeqAIJCUSPARSE: the caller may have written
   through the host array, so the CPU copy becomes the authoritative one. */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  *array         = NULL;                 /* invalidate the caller's pointer */
  A->offloadmask = PETSC_OFFLOAD_CPU;    /* device copy is now stale */
  PetscFunctionReturn(0);
}
1832 
/* Hand out the host value array of A for read-only access, first syncing the
   host copy with the device if the device one is newer. */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  Mat_SeqAIJ     *aij;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* read access still needs up-to-date host values */
  ierr   = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  aij    = (Mat_SeqAIJ*)A->data;
  *array = aij->a;
  PetscFunctionReturn(0);
}
1842 
/* Counterpart of MatSeqAIJGetArrayRead_SeqAIJCUSPARSE. Read-only access cannot
   have modified anything, so the offload mask is left untouched. */
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL; /* invalidate the caller's pointer */
  PetscFunctionReturn(0);
}
1849 
/* Hand out the host value array for write-only access. Since the caller promises
   to overwrite the values, no device-to-host copy is performed first. */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  *array = aij->a;
  PetscFunctionReturn(0);
}
1856 
/* Counterpart of MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE: the host values were
   (re)written, so the CPU copy becomes the authoritative one. */
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  *array         = NULL;                 /* invalidate the caller's pointer */
  A->offloadmask = PETSC_OFFLOAD_CPU;    /* device copy is now stale */
  PetscFunctionReturn(0);
}
1864 
/*
  MatSeqAIJCUSPARSECopyToGPU - push the host AIJ matrix to the device in the
  storage format selected in Mat_SeqAIJCUSPARSE (CSR, or ELL/HYB for CUDA < 11).

  Two paths:
  . if the nonzero pattern is unchanged and the format is CSR, only the values
    are re-uploaded into the existing device CSR;
  . otherwise the whole device representation (mult struct, work vector, cached
    row offsets, transpose) is rebuilt from scratch.

  Compressed-row (only nonzero rows stored) input is supported; in that case the
  row indices are also uploaded so Mult can scatter results back.
*/
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE; /* set to FALSE when only the structure (no values) is uploaded */
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      matrix->values->assign(a->a, a->a+a->nz); /* host-to-device upload of the values only */
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* values changed, so any cached transpose values are stale (structure kept) */
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
    } else {
      PetscInt nnz;
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* nonzero pattern (or format) changed: throw away the old device data entirely */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        if (a->compressedrow.use) {
          /* compressed-row storage: only the nonzero rows are represented */
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
        if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");
        /* no value array yet: upload structure only and do not mark the GPU copy as holding values */
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
        else nnz = a->nz;

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident scalar constants, required because the handle uses device pointer mode below */
        err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            /* generic-API descriptor wrapping the same device arrays, used by the SpMV/SpMM routines */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                    mat->num_rows, mat->num_cols, mat->num_entries,
                                    mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                    mat->values->data().get(),
                                    CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                    CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
          }
         #endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
         #else
          /* build a temporary device CSR, convert it to HYB/ELL, then discard the CSR */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          /* ELL uses a fixed width per row (PARTITION_MAX); HYB lets cusparse choose the split */
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
              matstruct->descr, mat->values->data().get(),
              mat->row_offsets->data().get(),
              mat->column_indices->data().get(),
              hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
         #endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          /* work vector of length nrows plus the device copy of the nonzero-row indices, needed to scatter Mult results */
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        /* row offsets + column indices (int), optional row indices (PetscInt), 3 scalar constants + values */
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
2021 
/* thrust functor for zipped (src,dst) iterators: dst = dst + src */
struct VecCUDAPlusEquals
{
  template <typename Pair>
  __host__ __device__
  void operator()(Pair p)
  {
    thrust::get<1>(p) = thrust::get<1>(p) + thrust::get<0>(p);
  }
};
2031 
/* thrust functor for zipped (src,dst) iterators: dst = src */
struct VecCUDAEquals
{
  template <typename Pair>
  __host__ __device__
  void operator()(Pair p)
  {
    thrust::get<1>(p) = thrust::get<0>(p);
  }
};
2041 
/* thrust functor for zipped iterators, copying in the opposite direction: first = second */
struct VecCUDAEqualsReverse
{
  template <typename Pair>
  __host__ __device__
  void operator()(Pair p)
  {
    thrust::get<0>(p) = thrust::get<1>(p);
  }
};
2051 
/* Per-product scratch data attached to Mat_Product for cuSPARSE matrix-matrix products */
struct MatMatCusparse {
  PetscBool             cisdense; /* user's C was MATSEQDENSE; convert back after computing on the GPU */
  PetscScalar           *Bt;      /* device buffer holding an explicit B^T (CUDA < 11 csrmm cannot transpose B) */
  Mat                   X;        /* dense intermediate used by PtAP/RARt before the final dense-dense product */
  PetscBool             reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops;
  CsrMatrix             *Bcsr;    /* NOTE(review): device CSR copy of B; presumably used by the sparse-sparse product path — confirm against callers */

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr; /* generic-API sparse descriptor for B */
  PetscBool             initialized;   /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr;  /* generic-API dense descriptor for B */
  cusparseDnMatDescr_t  matCDescr;  /* generic-API dense descriptor for C (or the intermediate X) */
  PetscInt              Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  /* NOTE(review): extra device buffers, presumably for the CUDA 11.4+ SpGEMM reuse API — confirm where allocated */
  void                  *dBuffer4;
  void                  *dBuffer5;
 #endif
  size_t                mmBufferSize; /* current size of mmBuffer in bytes */
  void                  *mmBuffer;    /* workspace for cusparseSpMM / SpGEMM compute */
  void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc; /* descriptor for sparse-sparse products */
#endif
};
2076 
/*
  MatDestroy_MatMatCusparse - destructor for the MatMatCusparse scratch data,
  installed as C->product->destroy.

  Frees every device buffer and cusparse descriptor that may have been created by
  the symbolic/numeric product routines. All frees are guarded (or rely on
  cudaFree(NULL)/delete NULL being no-ops), since which members were allocated
  depends on the product type and CUDA version.
*/
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  PetscErrorCode   ierr;
  MatMatCusparse   *mmdata = (MatMatCusparse *)data;
  cudaError_t      cerr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t stat;
 #endif

  PetscFunctionBegin;
  cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr); /* cudaFree(NULL) is a no-op, so no guard needed */
  delete mmdata->Bcsr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* generic-API descriptors only exist on CUDA >= 11 */
  if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->matBDescr)   { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->matCDescr)   { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->spgemmDesc)  { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); }
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  if (mmdata->dBuffer4)  { cerr = cudaFree(mmdata->dBuffer4);CHKERRCUDA(cerr); }
  if (mmdata->dBuffer5)  { cerr = cudaFree(mmdata->dBuffer5);CHKERRCUDA(cerr); }
 #endif
  if (mmdata->mmBuffer)  { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); }
  if (mmdata->mmBuffer2) { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); }
 #endif
  ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2105 
2106 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
2107 
/*
  MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA - numeric phase of a product
  between a SeqAIJCUSPARSE matrix A and a dense matrix B, producing dense C.

  Supports AB, AtB, ABt, PtAP and RARt. The sparse-times-dense kernel computes
  op(A)*op(B); for PtAP and RARt the result goes into the intermediate mmdata->X
  and a final dense-dense product (B^T * X or B * X) forms C.

  On CUDA >= 11 this uses the generic cusparseSpMM API with cached Sp/Dn
  descriptors (rebuilt when the leading dimensions of B or C change); before
  CUDA 11 it falls back to csrmm, explicitly transposing B via cublasXgeam when
  op(B) is needed, since csrmm cannot transpose B.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr   = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp   = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* pick which device representation of A to use and the size of op(A)*op(B) */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP: /* PtAP: this phase computes X = A*P; the Pt* part is applied at the end */
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!A->form_explicit_transpose) {
      /* let cusparse apply the transpose on the fly */
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      /* use an explicitly stored A^T instead */
      ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt: /* RARt: this phase computes X = A*R^T; the R* part is applied at the end */
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);

  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* PtAP/RARt write the sparse-dense product into the intermediate X, not into C */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    /* dense descriptors bake in the leading dimension, so they must be rebuilt when it changes */
    if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Clda = clda;
    }

    /* lazily create the generic-API descriptor for A if CopyToGPU did not (e.g. empty matrix) */
    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
    /* grow (never shrink) the SpMM workspace */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      cudaError_t cerr;
      cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
      cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
 #else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* form B^T explicitly into mmdata->Bt (allocated in the symbolic phase) via a cuBLAS out-of-place transpose */
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k    = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
 #endif
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  /* finish PtAP/RARt with a dense-dense product of B (or B^T) against the intermediate X */
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  /* restore the types the user handed in */
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2280 
/*
  MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA - symbolic phase of a product
  between a SeqAIJCUSPARSE matrix and a dense matrix.

  Sizes C according to the product type, forces C onto the GPU (remembering
  whether the user asked for plain MATSEQDENSE so the numeric phase can convert
  back), and allocates the MatMatCusparse scratch data: the explicit-B^T buffer
  needed by the pre-CUDA-11 path, and the dense intermediate X for PtAP/RARt.
  Finally installs the numeric-phase callback.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  /* result dimensions per product type */
  switch (product->type) {
  case MATPRODUCT_AB:   /* C = A * B */
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:  /* C = A^T * B */
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:  /* C = A * B^T */
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP: /* C = P^T * A * P, with P = B */
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt: /* C = R * A * R^T, with R = B */
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense;
 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
 #endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2354 
/*
   Numeric phase of a sparse-sparse product C = op(A)*op(B) with A,B,C all MATSEQAIJCUSPARSE.

   Preconditions (established by MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE):
   - C->product->data holds a MatMatCusparse created during the symbolic phase;
   - the CSR structure of C (row offsets, column indices) is already allocated on the GPU;
   - all three matrices use MAT_CUSPARSE_CSR storage.

   On CUDA >= 11.4 the values are recomputed with cusparseSpGEMMreuse_compute();
   on CUDA 11.0-11.3 with cusparseSpGEMM_compute() + cusparseSpGEMM_copy();
   on older toolkits with the legacy csrgemm interface.  Transposed operands are
   handled via explicit transposes formed in the symbolic phase, since the cuSPARSE
   spgemm routines only accept non-transposed inputs (see opA/opB below).
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t         BmatSpDescr;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;
  A = product->A;
  B = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix*)Cmat->mat;
    if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
    goto finalize; /* values already on the GPU: only the host-side assembly bookkeeping remains */
  }
  if (!c->nz) goto finalize; /* empty product: nothing to compute */
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  /* make sure the latest host values of A and B are mirrored on the GPU */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);

  /* AtB/ABt were reduced to AB in the symbolic phase when symmetry allows it; the
     transposed mult structs were built there, so here we only select the right one */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    if (!product->symbolic_used_the_fact_A_is_symmetric) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    if (!product->symbolic_used_the_fact_B_is_symmetric) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric");
  }
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose; /* explicit transpose formed in the symbolic phase */
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose; /* explicit transpose formed in the symbolic phase */
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix*)Cmat->mat;
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  /* alpha_one/beta_zero live in device memory (see symbolic phase), hence device pointer mode */
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
    /* reuse path: structure analysis was done once in the symbolic phase, only values are recomputed */
    stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                               Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                               cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                               mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
  #else
    /* recompute the product using the buffers kept alive in mmdata, then copy into C's arrays */
    stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                               Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                               cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                               mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
    stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                               Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                               cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
  #endif
#else
  /* legacy (pre-11.0) interface: recomputes both structure and values directly into C's CSR arrays */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr); /* flop count precomputed in the symbolic phase */
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  C->offloadmask = PETSC_OFFLOAD_GPU; /* values now valid only on the device */
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
  ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax);CHKERRQ(ierr);
  c->reallocs         = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}
2484 
/*
   Symbolic phase of a sparse-sparse product C = op(A)*op(B) with A,B MATSEQAIJCUSPARSE.

   Responsibilities:
   - validate operand types/formats and allocate the MatMatCusparse product data;
   - reduce AtB/ABt to AB when the symmetry of A/B permits, recording that fact in
     the product so the numeric phase can check it;
   - set up C as a MATSEQAIJCUSPARSE of the right sizes, mirroring A's compressed-row
     layout in C when applicable (ciscompressed);
   - when B is stored compressed, build a temporary full-row-offset CSR view of B
     (mmdata->Bcsr) since the spgemm routines need standard row offsets;
   - run the structure (and, for the modern APIs, value) computation:
       CUDA >= 11.4 : cusparseSpGEMMreuse_* (keeps dBuffer4/dBuffer5 for reuse in numeric)
       CUDA 11.0-3  : cusparseSpGEMM_* (keeps mmBuffer/mmBuffer2; also computes values,
                      since the old spgemm API cannot do symbolic-only)
       older        : cusparseXcsrgemmNnz + cusparse_csr_spgemm
   - mirror the resulting CSR structure back to the host (c->i, c->j, ilen/imax, etc.)
     so C behaves like a regular assembled SeqAIJ matrix on the CPU side;
   - if the caller used the old MatMatMult-style API and values were computed here,
     set mmdata->reusesym so the numeric phase can skip recomputation once.

   Note the empty-product early exit (zero rows/cols or empty A/B): cuSPARSE raises
   errors on degenerate inputs, so those cases are handled without calling it.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *a,*b,*c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     i,j,m,n,k;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble               flops;
  PetscBool                    biscompressed,ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
  cusparseSpMatDescr_t         BmatSpDescr;
#else
  int                          cnz;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ*)A->data;
  b = (Mat_SeqAIJ*)B->data;
  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");

  /* exploit symmetry to avoid building explicit transposes; the numeric phase
     verifies these flags so the two phases cannot get out of sync */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
  }
  if (B->symmetric && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
  }
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  /* pick operand mult structs and the result dimensions m x n (k = inner dimension) */
  switch (ptype) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    k = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr); /* spgemm needs non-transposed operands */
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr); /* spgemm needs non-transposed operands */
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  ierr  = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  ierr  = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  c     = (Mat_SeqAIJ*)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
  Ccsr  = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
    c->compressedrow.nrows = a->compressedrow.nrows;
    ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
    ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m; /* rows actually stored on the GPU */
  Ccusp->mat      = Cmat;
  Ccusp->mat->mat = Ccsr;
  Ccsr->num_rows    = Ccusp->nrows;
  Ccsr->num_cols    = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
  stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
  /* device-resident scalar constants, required by CUSPARSE_POINTER_MODE_DEVICE calls */
  cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
    thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
    c->nz = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix*)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix */
    CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
    Bcsr = new CsrMatrix;
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices; /* shared with the compressed CSR; only row offsets differ */
    Bcsr->values         = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
      ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr = Bcsr; /* stored so the numeric phase (and destroy) can find this view */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
                               Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                               Bcsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  /* precompute flops count */
  if (ptype == MATPRODUCT_AB) {
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i+1];
      for (j=st; j<en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2.*(b->i[brow+1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt anzi = a->i[i+1] - a->i[i];
      const PetscInt bnzi = b->i[i+1] - b->i[i];
      flops += (2.*anzi)*bnzi;
    }
  } else { /* TODO */
    flops = 0.;
  }

  mmdata->flops = flops;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  /* C's descriptor starts with 0 nnz and NULL arrays; cuSPARSE fills in the size,
     and the arrays are attached later via cusparseCsrSetPointers() */
  stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
                          NULL, NULL, NULL,
                          CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                          CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
 {
  /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
     We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
  */
  void*  dBuffer1 = NULL;
  void*  dBuffer2 = NULL;
  void*  dBuffer3 = NULL;
  /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
  size_t bufferSize1 = 0;
  size_t bufferSize2 = 0;
  size_t bufferSize3 = 0;
  size_t bufferSize4 = 0;
  size_t bufferSize5 = 0;

  /*----------------------------------------------------------------------*/
  /* ask bufferSize1 bytes for external memory */
  stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                            CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                            &bufferSize1, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &dBuffer1, bufferSize1);CHKERRCUDA(cerr);
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                            CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                            &bufferSize1, dBuffer1);CHKERRCUSPARSE(stat);

  /*----------------------------------------------------------------------*/
  /* two-call idiom: first query buffer sizes with NULL, then call again with the buffers */
  stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                 &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &dBuffer2, bufferSize2);CHKERRCUDA(cerr);
  cerr = cudaMalloc((void**) &dBuffer3, bufferSize3);CHKERRCUDA(cerr);
  cerr = cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4);CHKERRCUDA(cerr);
  stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                 &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);CHKERRCUSPARSE(stat);
  cerr = cudaFree(dBuffer1);CHKERRCUDA(cerr);
  cerr = cudaFree(dBuffer2);CHKERRCUDA(cerr);

  /*----------------------------------------------------------------------*/
  /* get matrix C non-zero entries C_nnz1 */
  stat  = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
  c->nz = (PetscInt) C_nnz1;
  /* allocate matrix C */
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values         = new THRUSTARRAY(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  /* update matC with the new pointers */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());CHKERRCUSPARSE(stat);

  /*----------------------------------------------------------------------*/
  stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                  &bufferSize5, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5);CHKERRCUDA(cerr);
  stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                  &bufferSize5, mmdata->dBuffer5);CHKERRCUSPARSE(stat);
  cerr = cudaFree(dBuffer3);CHKERRCUDA(cerr);
  /* also compute the values here so an api_user numeric call can just reuse them (reusesym) */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                                     Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                     cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                     mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
  ierr = PetscInfo9(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024);CHKERRQ(ierr);
 }
 #else
  size_t bufSize2;
  /* ask bufferSize bytes for external memory */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
  /* ask bufferSize again bytes for external memory */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
  /* The CUSPARSE documentation is not clear, nor the API
     We need both buffers to perform the operations properly!
     mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
     it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
     is stored in the descriptor! What a messy API... */
  cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
  /* compute the intermediate product of A * B */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  /* get matrix C non-zero entries C_nnz1 */
  stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
  c->nz = (PetscInt) C_nnz1;
  ierr = PetscInfo9(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr);
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
 #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
#else
  /* legacy path: nnz counts use host pointer mode, values use device pointer mode */
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
  stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
  c->nz = cnz;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
     I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
     D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalizesym:
  /* mirror the GPU CSR structure to the host arrays expected by SeqAIJ */
  c->singlemalloc = PETSC_FALSE;
  c->free_a       = PETSC_TRUE;
  c->free_ij      = PETSC_TRUE;
  ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
  if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
    PetscInt *d_i = c->i;
    THRUSTINTARRAY ii(Ccsr->row_offsets->size());
    THRUSTINTARRAY jj(Ccsr->column_indices->size());
    ii   = *Ccsr->row_offsets;
    jj   = *Ccsr->column_indices;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  } else {
    PetscInt *d_i = c->i;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  if (ciscompressed) { /* need to expand host row offsets */
    PetscInt r = 0;
    c->i[0] = 0;
    for (k = 0; k < c->compressedrow.nrows; k++) {
      const PetscInt next = c->compressedrow.rindex[k];
      const PetscInt old = c->compressedrow.i[k];
      for (; r < next; r++) c->i[r+1] = old; /* rows before the next stored row are empty: repeat the offset */
    }
    for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows]; /* trailing empty rows */
  }
  ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
  c->maxnz = c->nz;
  c->nonzerorowcnt = 0;
  c->rmax = 0;
  /* per-row lengths and row statistics, as MatAssemblyEnd_SeqAIJ would compute them */
  for (k = 0; k < m; k++) {
    const PetscInt nn = c->i[k+1] - c->i[k];
    c->ilen[k] = c->imax[k] = nn;
    c->nonzerorowcnt += (PetscInt)!!nn;
    c->rmax = PetscMax(c->rmax,nn);
  }
  ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); /* host values array; not filled here */
  Ccsr->num_entries = c->nz;

  C->nonzerostate++;
  ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
  Ccusp->nonzerostate = C->nonzerostate;
  C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
  C->preallocated  = PETSC_TRUE;
  C->assembled     = PETSC_FALSE;
  C->was_assembled = PETSC_FALSE;
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
2883 
2884 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2885 
/* handles sparse or dense B

   Routing logic for matrix products involving a SeqAIJCUSPARSE matrix A:
   decides whether the product's symbolic phase runs through the GPU-aware
   kernels in this file or falls back to the CPU SeqAIJ implementations.

   The GPU sparse-sparse path requires B (and, for ABC, C) to be of type
   MATSEQAIJCUSPARSE with none of the operands bound to the CPU, and the user
   must not have requested the CPU backend via the -mat*_backend_cpu options
   queried below. */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
  PetscErrorCode ierr;
  PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr);
  /* only consider the GPU path when neither A nor B has been bound to the CPU */
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr);
  }
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr);
    }
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool usecpu = PETSC_FALSE;
    /* The option name depends on how the user entered: the legacy API
       (MatMatMult etc., product->api_user set) or the MatProduct API */
    switch (product->type) {
    case MATPRODUCT_AB:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_ab_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_AtB:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_atb_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_PtAP:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_ptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_RARt:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_rart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_ABC:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_abc_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    default:
      break;
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE; /* user asked for the CPU backend: fall through to the AIJ dispatch below */
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
     if (product->A->boundtocpu) {
        ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      /* triple products are composed from the pairwise GPU kernels */
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3008 
/* yy = A*xx; thin wrapper over the shared SpMV kernel below */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* trans=PETSC_FALSE, herm=PETSC_FALSE, no added vector (NULL) */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3017 
/* zz = A*xx + yy; thin wrapper over the shared SpMV kernel */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3026 
/* yy = A^H*xx (conjugate transpose); thin wrapper over the shared SpMV kernel */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* trans=PETSC_TRUE, herm=PETSC_TRUE selects op = ^H in the kernel */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3035 
/* zz = A^H*xx + yy; thin wrapper over the shared SpMV kernel */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3044 
/* yy = A^T*xx; thin wrapper over the shared SpMV kernel */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* trans=PETSC_TRUE, herm=PETSC_FALSE selects op = ^T in the kernel */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3053 
/* y[idx[i]] += x[i] for 0 <= i < n: scatter-add the (short) work vector x into
   the full-length vector y at positions idx.  The caller passes the
   compressed-row indices as idx, which appear to be distinct (one entry per
   nonzero row), so no atomics are used — NOTE(review): confirm distinctness if
   reused elsewhere.  Launched 1-D with at least n threads; extra threads exit
   on the bounds check.  idx and x are read-only, so they are const-qualified
   (lets the compiler use the read-only data cache). */
__global__ static void ScatterAdd(PetscInt n, const PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  /* compute the flat index in PetscInt so it cannot overflow 32-bit int when
     PetscInt is 64-bit and n is very large */
  const PetscInt i = (PetscInt)blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) y[idx[i]] += x[i];
}
3059 
/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op

   Shared implementation behind all the MatMult*_SeqAIJCUSPARSE wrappers.
   When y is NULL, computes z = op(A) x.  yy == zz is allowed (in-place add).
   "Compressed" below refers to the compressed-row (zero-row-dropping) storage:
   the GPU matrix then has fewer rows than the full z, so a work vector holds
   the short product which is scattered into z afterwards. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* nx,ny are only assigned (and only consumed) in the MAT_CUSPARSE_CSR branches below */
  PetscInt                     nx,ny;
#endif

  PetscFunctionBegin;
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
  /* empty matrix: result is just 0 or a copy of y */
  if (!a->nonzerorowcnt) {
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    /* transpose product: either let cuSPARSE transpose on the fly (opA), or
       use the explicitly stored transpose matrix with a non-transpose op */
    if (herm || !A->form_explicit_transpose) {
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
     #endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
     #endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* cuSpMV[] is indexed by opA, so guard against the enum values changing */
      if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        cudaError_t cerr;
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                matstruct->matDescr,
                                matstruct->cuSpMV[opA].vecXDescr, beta,
                                matstruct->cuSpMV[opA].vecYDescr,
                                cusparse_scalartype,
                                cusparsestruct->spmvAlg,
                                &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
      }

      stat = cusparseSpMV(cusparsestruct->handle, opA,
                               matstruct->alpha_one,
                               matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
                               matstruct->cuSpMV[opA].vecXDescr,
                               beta,
                               matstruct->cuSpMV[opA].vecYDescr,
                               cusparse_scalartype,
                               cusparsestruct->spmvAlg,
                               matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
     #else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha_one, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
     #endif
    } else {
      if (cusparsestruct->nrows) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha_one, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
       #endif
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
         */
       #if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
       #else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
       #endif
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      /* transpose path produced a full-length result already; only add yy when
         it was not folded in via beta (i.e. when zz and yy are distinct) */
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3251 
/* zz = A^T*xx + yy; thin wrapper over the shared SpMV kernel */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3260 
/* Complete assembly with the host AIJ routine, then drop the cached device
   matrix handle if assembly changed the nonzero structure (the cached copy
   would no longer match). */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscObjectState   statebefore     = A->nonzerostate;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (cusparsestruct->deviceMat && A->nonzerostate != statebefore) {
    cudaError_t cerr;

    ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr);
    cerr = cudaFree(cusparsestruct->deviceMat);CHKERRCUDA(cerr);
    cusparsestruct->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}
3278 
3279 /* --------------------------------------------------------------------------------*/
3280 /*@
3281    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
3282    (the default parallel PETSc format). This matrix will ultimately pushed down
3283    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
3284    assembly performance the user should preallocate the matrix storage by setting
3285    the parameter nz (or the array nnz).  By setting these parameters accurately,
3286    performance during matrix assembly can be increased by more than a factor of 50.
3287 
3288    Collective
3289 
3290    Input Parameters:
3291 +  comm - MPI communicator, set to PETSC_COMM_SELF
3292 .  m - number of rows
3293 .  n - number of columns
3294 .  nz - number of nonzeros per row (same for all rows)
3295 -  nnz - array containing the number of nonzeros in the various rows
3296          (possibly different for each row) or NULL
3297 
3298    Output Parameter:
3299 .  A - the matrix
3300 
3301    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
3302    MatXXXXSetPreallocation() paradgm instead of this routine directly.
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
3304 
3305    Notes:
3306    If nnz is given then nz is ignored
3307 
3308    The AIJ format (also called the Yale sparse matrix format or
3309    compressed row storage), is fully compatible with standard Fortran 77
3310    storage.  That is, the stored row and column indices can begin at
3311    either one (as in Fortran) or zero.  See the users' manual for details.
3312 
3313    Specify the preallocated storage with either nz or nnz (not both).
3314    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
3315    allocation.  For large problems you MUST preallocate memory or you
3316    will get TERRIBLE performance, see the users' manual chapter on matrices.
3317 
3318    By default, this format uses inodes (identical nodes) when possible, to
3319    improve numerical efficiency of matrix-vector products and solves. We
3320    search for consecutive rows with the same nonzero structure, thereby
3321    reusing matrix information to achieve increased efficiency.
3322 
3323    Level: intermediate
3324 
3325 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
3326 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); /* sequential matrix: local sizes equal global sizes */
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* const is cast away for the internal preallocation routine; nnz is presumably
     only read there — NOTE(review): confirm against MatSeqAIJSetPreallocation_SeqAIJ */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3338 
/* Free the GPU-side data (plain matrix or triangular factors), detach every
   method this type composed on the object, then hand off to the base SeqAIJ
   destroy routine. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  /* names of all composed methods registered by this type; cleared in order */
  const char *const composed[] = {"MatSeqAIJCopySubArray_C",
                                  "MatCUSPARSESetFormat_C",
                                  "MatCUSPARSESetUseCPUSolve_C",
                                  "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",
                                  "MatProductSetFromOptions_seqaijcusparse_seqdense_C",
                                  "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",
                                  "MatFactorGetSolverType_C",
                                  "MatSetPreallocationCOO_C",
                                  "MatSetValuesCOO_C",
                                  "MatConvert_seqaijcusparse_hypre_C"};
  size_t            k;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  for (k = 0; k < sizeof(composed)/sizeof(composed[0]); k++) {
    ierr = PetscObjectComposeFunction((PetscObject)A,composed[k],NULL);CHKERRQ(ierr);
  }
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3362 
3363 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
3364 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicate via the host AIJ routine, then convert the copy in place back to
   the CUSPARSE type so *B carries GPU support like A. */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3374 
/* Y = Y + a*X on the GPU.  Dispatches on the nonzero-pattern relation:
   - identical patterns -> single cuBLAS axpy on the value arrays;
   - SUBSET_NONZERO_PATTERN -> cuSPARSE csr-spgeam (in place on Y's arrays);
   - otherwise -> fall back to the CPU MatAXPY_SeqAIJ. */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry,*csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
  /* mismatched axpy implementations imply one matrix is not on the GPU path */
  if (X->ops->axpy != Y->ops->axpy) {
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
  if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;
  /* see if we can turn this into a cublas axpy: same nz count and identical
     row offsets and column indices mean the patterns match exactly */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
    if (eq) {
      eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
    }
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    cusparseStatus_t stat;
    PetscScalar      b = 1.0; /* coefficient of Y in a*X + b*Y */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t           bufferSize;
    void             *buffer;
    cudaError_t      cerr;
#endif

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    /* scalars a,b live on the host, so switch the handle's pointer mode */
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
                                          &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                          &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                             cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    cerr = cudaFree(buffer);CHKERRCUDA(cerr);
#else
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
#endif
    /* restore the pointer mode the rest of the code expects */
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else if (str == SAME_NONZERO_PATTERN) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t berr;
    PetscBLASInt   one = 1, bnz = 1;

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr);
    ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else {
    /* different patterns: do the work on the CPU */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3472 
/* Y = a*Y: scale the matrix values in place on the GPU with a single cuBLAS
   scal over the (flat) nonzero value array, then invalidate the cached
   diagonal since the entries changed. */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)Y->data;
  cublasHandle_t cublasv2handle;
  cublasStatus_t berr;
  PetscScalar    *aval;
  PetscBLASInt   one = 1, bnz = 1;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEGetArray(Y,&aval);CHKERRQ(ierr);
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(aij->nz,&bnz);CHKERRQ(ierr); /* nz may exceed PetscBLASInt range */
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  berr = cublasXscal(cublasv2handle,bnz,&a,aval,one);CHKERRCUBLAS(berr);
  ierr = PetscLogGpuFlops(bnz);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSERestoreArray(Y,&aval);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3494 
/* Zero all stored nonzeros of A. For unfactored matrices, any device-side CSR value arrays
   (including the cached transpose) are zeroed with thrust; the host values array is always
   zeroed; the offload mask records which copies are now valid. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;
  PetscBool      both = PETSC_FALSE; /* set when the device copy was zeroed too */
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
    if (spptr->mat) {
      CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
      if (matrix->values) {
        both = PETSC_TRUE;
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
    if (spptr->matTranspose) { /* keep the cached transpose consistent with the zeroed matrix */
      CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
      if (matrix->values) {
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
  }
  //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);
  ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr); /* zero the host values array (a->i[n] == nz) */
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
3525 
/* Bind A to the CPU (flg true) or to the GPU (flg false): swaps the Mat/Mat_SeqAIJ operation
   tables and the composed functions between the plain SeqAIJ implementations and the CUSPARSE
   ones. Factored matrices only record the flag. Binding to CPU first copies the matrix data
   back from the device. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    A->boundtocpu = flg;
    PetscFunctionReturn(0);
  }
  if (flg) {
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); /* host copy must be valid before CPU ops take over */

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    ierr = PetscMemzero(a->ops,sizeof(Mat_SeqAIJOps));CHKERRQ(ierr); /* fall back to default SeqAIJ array accessors */
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  /* inode optimizations only apply to the CPU code path */
  if (flg && a->inode.size) {
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3589 
/* Convert a SeqAIJ matrix to SeqAIJCUSPARSE: (optionally) duplicate/copy A into *newmat, then
   install the CUSPARSE spptr (with cusparse handle, stream and default algorithms), swap in the
   CUSPARSE op table via MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE), and compose type-specific
   functions. Handles MAT_INITIAL_MATRIX, MAT_REUSE_MATRIX and (implicitly) MAT_INPLACE_MATRIX. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  B = *newmat;

  /* vectors created to match this matrix should live on the GPU as well */
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;
      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      spptr->format     = MAT_CUSPARSE_CSR;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
     #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
      spptr->spmvAlg    = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
     #else
      spptr->spmvAlg    = CUSPARSE_CSRMV_ALG1;    /* default, since we only support csr */
     #endif
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
     #endif
      B->spptr = spptr;
    } else {
      /* factored matrices carry a tri-factor structure instead of the mult structure */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr); /* install the GPU op table */
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
#if defined(PETSC_HAVE_HYPRE)
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr);
#endif
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetUseCPUSolve_C",MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3651 
/* Type constructor for MATSEQAIJCUSPARSE: build a plain SeqAIJ matrix, then convert it
   in place to the CUSPARSE type */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3661 
3662 /*MC
3663    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3664 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
3666    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
3667    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3668 
3669    Options Database Keys:
3670 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
3671 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_use_cpu_solve - Do MatSolve on CPU
3674 
3675   Level: beginner
3676 
3677 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3678 M*/
3679 
3680 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);
3681 
/* Register the CUSPARSE solver packages with the MatSolverType registry: the banded LU solver
   for MATSEQAIJ, and LU/Cholesky/ILU/ICC factorizations for MATSEQAIJCUSPARSE */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND,MATSEQAIJ,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}
3695 
/* Free a Mat_SeqAIJCUSPARSE structure: both mult structures (matrix and transpose), the
   thrust work arrays, the cusparse handle, and finally the struct itself */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    /* deleting a NULL pointer is a no-op, so these members need no guards */
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    delete (*cusparsestruct)->csr2csc_i;
    if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr); /* also zeros *cusparsestruct */
  }
  PetscFunctionReturn(0);
}
3715 
/* Free a CsrMatrix (its three device arrays and the struct itself) and zero the caller's
   pointer; a NULL *mat is a no-op */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    CsrMatrix *m = *mat;
    *mat = NULL;                /* clear the caller's pointer up front */
    delete m->values;
    delete m->column_indices;
    delete m->row_offsets;
    delete m;
  }
  PetscFunctionReturn(0);
}
3728 
/* Free a triangular-factor structure: the cusparse matrix descriptor and analysis info,
   the CSR storage, and the device/host work buffers, then the struct itself */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    if ((*trifactor)->solveBuffer)   {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
    if ((*trifactor)->AA_h)   {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);} /* pinned host buffer */
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
   #endif
    ierr = PetscFree(*trifactor);CHKERRQ(ierr); /* also zeros *trifactor */
  }
  PetscFunctionReturn(0);
}
3748 
/* Free a mult structure: the stored matrix (CSR, or HYB/ELL before CUDA-11), its descriptor,
   the compressed-row index array, the device-resident scalar constants, and (CUDA >= 11) the
   generic SpMV descriptors and buffers */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
       #endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr); /* fix: return value was previously ignored */
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    /* alpha_one/beta_zero/beta_one are device-resident scalars used with device pointer mode */
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    for (int i=0; i<3; i++) { /* one cached SpMV setup per operation: N, T, H */
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
3792 
/* Release all factor data held by a tri-factors structure (but not the structure or its
   cusparse handle), leaving it ready for a new factorization. Safe to call repeatedly:
   every freed member is reset to NULL. */
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = NULL;
    (*trifactors)->cpermIndices = NULL;
    (*trifactors)->workVector = NULL;
    if ((*trifactors)->a_band_d)   {cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);}
    if ((*trifactors)->i_band_d)   {cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);}
    /* fix: NULL the band pointers after freeing, matching the members above; otherwise a
       second Reset() would cudaFree() them again */
    (*trifactors)->a_band_d = NULL;
    (*trifactors)->i_band_d = NULL;
    (*trifactors)->init_dev_prop = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3815 
/* Fully destroy a tri-factors structure: reset all factor data, destroy the cusparse
   handle, and free the struct itself */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    handle = (*trifactors)->handle; /* assignment hoisted out of the if condition (avoids -Wparentheses; same behavior) */
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr); /* also zeros *trifactors */
  }
  PetscFunctionReturn(0);
}
3832 
/* Strict weak ordering on (i,j) index pairs: lexicographic, row first, then column.
   Used to sort COO entries into CSR order. */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &lhs, const thrust::tuple<PetscInt, PetscInt> &rhs)
  {
    const PetscInt lr = lhs.get<0>(), rr = rhs.get<0>();
    return (lr < rr) || (lr == rr && lhs.get<1>() < rhs.get<1>());
  }
};
3843 
/* Equality of (i,j) index pairs: true iff both components match.
   Used to collapse duplicate COO entries. */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &lhs, const thrust::tuple<PetscInt, PetscInt> &rhs)
  {
    return lhs.get<0>() == rhs.get<0>() && lhs.get<1>() == rhs.get<1>();
  }
};
3853 
/* Binary op for adjacent_difference: 0 when the two values are equal, 1 otherwise.
   Marks positions where an index sequence changes. */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &a, const PetscInt &b)
  {
    return (a != b) ? 1 : 0;
  }
};
3862 
/* Logical OR of two flags, returned as a 0/1 PetscInt */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &a, const PetscInt &b)
  {
    return (a || b) ? 1 : 0;
  }
};
3871 
3872 #include <thrust/iterator/discard_iterator.h>
/* Insert/add the COO values v[] (ordered as in the preallocation call) into the device CSR
   values of A, using the permutation (cooPerm) and, when there were repeated (i,j) entries,
   the reduction map (cooPerm_a) built by MatSetPreallocationCOO_SeqAIJCUSPARSE(). v may be a
   host or device pointer; a NULL v with INSERT_VALUES zeroes the matrix. Ends by marking the
   device copy as the valid one. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY                           *cooPerm_v = NULL; /* device staging buffer when v is a host pointer */
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                             *matrix;
  PetscErrorCode                        ierr;
  PetscInt                              n;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) { /* no COO preallocation info: just finish assembly */
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) {
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else { /* copy host values to the device first */
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v,v+n);
    d_v = cooPerm_v->data();
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
  }
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to add them up first */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
        cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonzeros in d_v[].
        cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
      */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* all nonzeros in d_v[] are unique entries */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]]  */
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else { /* values[i] = d_v[cooPerm[i]] */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    }
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU; /* values changed on the device only */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr);
  ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",a->rmax);CHKERRQ(ierr);
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
3954 
/* Mark the cached device transpose of A as out of date; when destroy is true, also free the
   cached transpose structure and the csr2csc permutation used to rebuild it */
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (!cusp) PetscFunctionReturn(0); /* nothing cached yet */
  if (destroy) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
    delete cusp->csr2csc_i;
    cusp->csr2csc_i = NULL;
  }
  A->transupdated = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3971 
3972 #include <thrust/binary_search.h>
/* Preallocate A from a list of n COO entries (coo_i[],coo_j[]). On the device: sort the
   (i,j) pairs into CSR order while recording the permutation cooPerm, collapse duplicate
   pairs (building cooPerm_a, a map from sorted entry to unique-nonzero index, only when
   duplicates exist), and derive the CSR row offsets by binary search. The host-side CSR
   arrays of the Mat_SeqAIJ are then rebuilt to match. cooPerm/cooPerm_a are kept for later
   MatSetValuesCOO_SeqAIJCUSPARSE() calls. */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0; /* nzr counts the nonempty rows */
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) { /* previous COO info (if any) has the wrong size; discard it */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);

    /* Ex.
      n = 6
      coo_i = [3,3,1,4,1,4]
      coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    *cusp->cooPerm_a = d_i; /* copy the sorted array */
    THRUSTINTARRAY w = d_j;

    /*
      d_i     = [1,1,3,3,4,4]
      d_j     = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i     = [1,3,3,4,4,x]
                            ^ekey
      d_j     = [2,2,3,5,6,x]
                           ^nekey
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());                                              /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
      (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a =          [0,0,1,1,1,1]*/
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin()); /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* rebuild the host CSR arrays of the Mat_SeqAIJ from the device results */
    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
  } else { /* empty matrix */
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);

  A->assembled = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
4090 
4091 /*@C
4092     MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices.
4093 
4094    Not collective
4095 
4096     Input Parameters:
4097 +   A - the matrix
-   compressed - PETSC_TRUE or PETSC_FALSE indicating whether the matrix data structure should always be returned in compressed form
4099 
    Output Parameters:
+   i - the CSR row pointers
-   j - the CSR column indices
4103 
4104     Level: developer
4105 
4106     Notes:
4107       When compressed is true, the CSR structure does not contain empty rows
4108 
4109 .seealso: MatSeqAIJCUSPARSERestoreIJ(), MatSeqAIJCUSPARSEGetArrayRead()
4110 @*/
/* Return device pointers to the CSR row offsets and column indices of A, copying the matrix
   to the GPU first if needed. When compressed is PETSC_FALSE but the device matrix uses
   compressed-row storage, a full-length row-offset array is built from the host a->i and
   cached in cusp->rowoffsets_gpu. */
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  if (!i || !j) PetscFunctionReturn(0); /* both outputs must be requested; otherwise no-op */
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      if (!cusp->rowoffsets_gpu) { /* build and cache the uncompressed row offsets on the device */
        cusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
        ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(0);
}
4139 
4140 /*@C
4141     MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ()
4142 
4143    Not collective
4144 
4145     Input Parameters:
4146 +   A - the matrix
-   compressed - PETSC_TRUE or PETSC_FALSE indicating whether the matrix data structure should always be returned in compressed form

    Output Parameters:
+   i - the CSR row pointers
-   j - the CSR column indices
4152 
4153     Level: developer
4154 
4155 .seealso: MatSeqAIJCUSPARSEGetIJ()
4156 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* indices are owned by the matrix and were handed out read-only:
     no device state to update, just invalidate the borrowed pointers */
  if (i) *i = NULL;
  if (j) *j = NULL;
  PetscFunctionReturn(0);
}
4166 
4167 /*@C
4168    MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4169 
4170    Not Collective
4171 
4172    Input Parameter:
4173 .   A - a MATSEQAIJCUSPARSE matrix
4174 
4175    Output Parameter:
4176 .   a - pointer to the device data
4177 
4178    Level: developer
4179 
4180    Notes: may trigger host-device copies if up-to-date matrix data is on host
4181 
4182 .seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArrayRead()
4183 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csrmat;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* make sure the device copy is current; this may trigger a host-to-device transfer */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusparsestruct->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)cusparsestruct->mat->mat;
  if (!csrmat->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  /* read-only access: offload mask is left untouched */
  *a = csrmat->values->data().get();
  PetscFunctionReturn(0);
}
4202 
4203 /*@C
4204    MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead()
4205 
4206    Not Collective
4207 
4208    Input Parameter:
4209 .   A - a MATSEQAIJCUSPARSE matrix
4210 
4211    Output Parameter:
4212 .   a - pointer to the device data
4213 
4214    Level: developer
4215 
4216 .seealso: MatSeqAIJCUSPARSEGetArrayRead()
4217 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* read-only access: no object-state bump needed, just zero the borrowed pointer */
  *a = NULL;
  PetscFunctionReturn(0);
}
4227 
4228 /*@C
4229    MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4230 
4231    Not Collective
4232 
4233    Input Parameter:
4234 .   A - a MATSEQAIJCUSPARSE matrix
4235 
4236    Output Parameter:
4237 .   a - pointer to the device data
4238 
4239    Level: developer
4240 
4241    Notes: may trigger host-device copies if up-to-date matrix data is on host
4242 
4243 .seealso: MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArray()
4244 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csrmat;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* refresh the device copy first, since a read-write caller may read before writing */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusparsestruct->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)cusparsestruct->mat->mat;
  if (!csrmat->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csrmat->values->data().get();
  /* the caller may modify the device values: mark the GPU copy as the valid one
     and drop any cached transpose values (they would become stale) */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4265 /*@C
4266    MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray()
4267 
4268    Not Collective
4269 
4270    Input Parameter:
4271 .   A - a MATSEQAIJCUSPARSE matrix
4272 
4273    Output Parameter:
4274 .   a - pointer to the device data
4275 
4276    Level: developer
4277 
4278 .seealso: MatSeqAIJCUSPARSEGetArray()
4279 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* values may have been modified through the returned pointer: bump the object
     state so dependents of A know it changed */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL;
  PetscFunctionReturn(0);
}
4292 
4293 /*@C
4294    MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4295 
4296    Not Collective
4297 
4298    Input Parameter:
4299 .   A - a MATSEQAIJCUSPARSE matrix
4300 
4301    Output Parameter:
4302 .   a - pointer to the device data
4303 
4304    Level: developer
4305 
4306    Notes: does not trigger host-device copies and flags data validity on the GPU
4307 
4308 .seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSERestoreArrayWrite()
4309 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csrmat;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* write-only access: deliberately no MatSeqAIJCUSPARSECopyToGPU() here,
     the current values are about to be overwritten anyway */
  if (!cusparsestruct->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)cusparsestruct->mat->mat;
  if (!csrmat->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csrmat->values->data().get();
  /* flag the GPU copy as authoritative and drop any cached transpose values */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4329 
4330 /*@C
4331    MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite()
4332 
4333    Not Collective
4334 
4335    Input Parameter:
4336 .   A - a MATSEQAIJCUSPARSE matrix
4337 
4338    Output Parameter:
4339 .   a - pointer to the device data
4340 
4341    Level: developer
4342 
4343 .seealso: MatSeqAIJCUSPARSEGetArrayWrite()
4344 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* values were (presumably) written through the returned pointer: bump the
     object state so dependents of A know it changed */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL;
  PetscFunctionReturn(0);
}
4357 
/* Strict weak ordering for (row, col, value, flag) tuples used by thrust::merge:
   lexicographic on (row, col); the value and flag fields do not participate. */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    const int r1 = t1.get<0>();
    const int r2 = t2.get<0>();
    if (r1 != r2) return r1 < r2;     /* primary key: row index */
    return t1.get<1>() < t2.get<1>(); /* tie-break: column index */
  }
};
4368 
/* Unary functor adding a fixed offset to an int; used (via transform iterators)
   to shift column/row indices when concatenating matrices. */
struct Shift
{
  int _shift; /* offset added to every input value */

  Shift(int offset) : _shift(offset) {}
  __host__ __device__
  inline int operator() (const int &c)
  {
    return _shift + c;
  }
};
4380 
/* MatSeqAIJCUSPARSEMergeMats - merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows;
   [A';B']' operation in matlab notation, i.e. C = [A B] with A's columns coming first.

   With MAT_INITIAL_MATRIX the CSR structure of C is built on the GPU by converting A and B to COO,
   merging the two streams sorted by (row,col) and converting back to CSR; the permutation used is
   cached in Ccusp->cooPerm so that MAT_REUSE_MATRIX only needs to scatter the new values of A and B
   into C. MAT_INPLACE_MATRIX is not supported. A and B must have the same number of rows. */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     Annz,Bnnz;
  cusparseStatus_t             stat;
  PetscInt                     i,m,n,zero = 0;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidHeaderSpecific(B,MAT_CLASSID,2);
  PetscValidPointer(C,4);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
  if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
  if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
  if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    /* create C and set up its cusparse multiply structure by hand (we fill the
       device CSR directly instead of going through MatSetValues) */
    m     = A->rmap->n;
    n     = A->cmap->n + B->cmap->n;
    ierr  = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
    ierr  = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
    ierr  = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    c     = (Mat_SeqAIJ*)(*C)->data;
    Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
    Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr  = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows    = m;
    Ccusp->mat      = Cmat;
    Ccusp->mat->mat = Ccsr;
    Ccsr->num_rows  = m;
    Ccsr->num_cols  = n;
    /* descriptor and device-resident scalar constants used by the SpMV routines */
    stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
    if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
    if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr = (CsrMatrix*)Acusp->mat->mat;
    Bcsr = (CsrMatrix*)Bcusp->mat->mat;
    Annz = (PetscInt)Acsr->column_indices->size();
    Bnnz = (PetscInt)Bcsr->column_indices->size();
    c->nz = Annz + Bnnz;
    Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    Ccsr->num_entries = c->nz;
    /* permutation mapping entries of [A;B] (A first) into C; cached for MAT_REUSE_MATRIX */
    Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      auto Acoo = new THRUSTINTARRAY32(Annz);
      auto Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff,*Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
          ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
          ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      /* expand the CSR row offsets of A and B into explicit COO row indices */
      stat = cusparseXcsr2coo(Acusp->handle,
                              Aroff->data().get(),
                              Annz,
                              m,
                              Acoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle,
                              Broff->data().get(),
                              Bnnz,
                              m,
                              Bcoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
      /* shift B's column indices on the fly so they land after A's columns in C */
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
#endif
      /* merge the (row,col,val,origin-flag) streams of A and B, sorted by (row,col);
         the origin flag (1 for A, 0 for B) recorded in wPerm lets us split the
         resulting positions back into the A part and B part of cooPerm below */
      auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
      auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
      auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
      auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
      auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
      auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
      auto p1 = Ccusp->cooPerm->begin();
      auto p2 = Ccusp->cooPerm->begin();
      thrust::advance(p2,Annz);
      PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
      /* undo the in-place shift of B's column indices done above */
      thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
#endif
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      /* emulate partition_copy: positions flagged 1 (from A) go to p1, the rest (from B) to p2 */
      auto pred = thrust::identity<int>();
      PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
      PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
#endif
      /* compress the merged COO row indices back into C's CSR row offsets */
      stat = cusparseXcoo2csr(Ccusp->handle,
                              Ccoo->data().get(),
                              c->nz,
                              m,
                              Ccsr->row_offsets->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
                               Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
        ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr);
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix *CcsrT = new CsrMatrix;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated = PETSC_TRUE;
        Ccusp->rowoffsets_gpu = NULL;
        CmatT->cprowIndices = NULL;
        CmatT->mat = CcsrT;
        CcsrT->num_rows = n;
        CcsrT->num_cols = m;
        CcsrT->num_entries = c->nz;

        CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values = new THRUSTARRAY(c->nz);

        /* C' is a vertical stack of A' and B': concatenate row offsets (B's shifted by
           a->nz, overwriting the duplicated boundary entry), column indices and values */
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
          thrust::advance(rT,-1);
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
          thrust::copy(titb,tite,rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

        stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
                                 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
                                 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* mirror the CSR structure (not the values) on the host so C is a valid SeqAIJ matrix */
    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii   = *Ccsr->row_offsets;
      jj   = *Ccsr->column_indices;
      cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    } else {
      cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
    c->maxnz = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax = 0;
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i+1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax,nn);
    }
    ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
    /* host values array is allocated but intentionally left unfilled: the GPU copy is authoritative */
    ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
    (*C)->nonzerostate++;
    ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
    ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* MAT_REUSE_MATRIX: structure of C is unchanged; only scatter the new values of A and B
       into C using the cached cooPerm permutation */
    if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT,(*C)->rmap->n,B->rmap->n);
    c = (Mat_SeqAIJ*)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
      if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
      if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
      if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
      if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix*)Acusp->mat->mat;
      Bcsr = (CsrMatrix*)Bcusp->mat->mat;
      Ccsr = (CsrMatrix*)Ccusp->mat->mat;
      /* sanity checks: sizes must agree with the structure built at MAT_INITIAL_MATRIX time */
      if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %" PetscInt_FMT " != %" PetscInt_FMT,Acsr->num_entries,(PetscInt)Acsr->values->size());
      if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %" PetscInt_FMT " != %" PetscInt_FMT,Bcsr->num_entries,(PetscInt)Bcsr->values->size());
      if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT,Ccsr->num_entries,(PetscInt)Ccsr->values->size());
      if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT,Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
      if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %" PetscInt_FMT " != %" PetscInt_FMT,(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid,Acsr->num_entries);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      /* scatter A's values into C through the first Annz entries of cooPerm ... */
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      thrust::for_each(zibait,zieait,VecCUDAEquals());
      /* ... and B's values through the remaining entries */
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
      thrust::for_each(zibbit,ziebit,VecCUDAEquals());
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr);
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
        /* C' values are just A' values followed by B' values (see the initial-matrix branch) */
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    }
  }
  ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
4684 
/* MatSeqAIJCopySubArray_SeqAIJCUSPARSE - copies selected entries of the device values
   array of A into v.

   If idx is non-NULL, v[k] = a[idx[k]] for k = 0..n-1 (gather on the device);
   otherwise the first n entries are copied contiguously. v may point to either
   host or device memory (detected with isCudaMem()); for host memory the result
   is staged through a device buffer and a GPU->CPU transfer is logged. */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  PetscErrorCode    ierr;
  bool              dmem;
  const PetscScalar *av;
  cudaError_t       cerr;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
  if (n && idx) {
    /* upload the gather indices */
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

    /* destination on the device: v itself if it is device memory, otherwise a temporary */
    THRUSTARRAY *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* dv[k] = av[widx[k]] via a zipped permutation gather */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (w) {
      cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    delete w;
  } else {
    cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  /* bug fix: the host destination was filled by a device-to-host copy, so log it as
     GPU->CPU traffic (previously logged with PetscLogCpuToGpu, i.e. the wrong direction) */
  if (!dmem) { ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr); }
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4724