xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision f97672e55eacc8688507b9471cd7ec2664d7f203)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
7 
8 #include <petscconf.h>
9 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
10 #include <../src/mat/impls/sbaij/seq/sbaij.h>
11 #include <../src/vec/vec/impls/dvecimpl.h>
12 #include <petsc/private/vecimpl.h>
13 #undef VecType
14 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
15 #include <thrust/adjacent_difference.h>
16 #include <thrust/async/for_each.h>
17 #include <thrust/iterator/constant_iterator.h>
18 #include <thrust/remove.h>
19 #include <thrust/sort.h>
20 #include <thrust/unique.h>
21 
/* Option-name table for MatCUSPARSEStorageFormat, laid out the way PetscOptionsEnum()
   expects: the enum value names in 0-based order ("CSR","ELL","HYB"), followed by the
   enum type name, the option prefix, and a terminating 0 sentinel. */
const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
    0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
  } cusparseCsr2CscAlg_t;
  */
  /* Name tables mirroring the cuSPARSE enums above; entry order MUST match the 0-based
     integer value of each enum member (the SpMM table interleaves COO_ALG4/CSR_ALG1 for
     this reason), and the Csr2Csc table inserts a dummy "INVALID" at index 0 because
     cuSPARSE starts that enum at 1. */
  const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
  const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
  const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif
58 
/* ---- forward declarations of routines defined later in this file ---- */

/* Cholesky/ICC factorization front ends */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

/* LU/ILU factorization front ends */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

/* triangular solves, matrix operations, and options handling */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

/* destructors for the GPU-side data structures */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

/* device<->host synchronization helpers */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);

/* COO assembly and sub-array extraction */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscCount,const PetscInt[],const PetscInt[]);
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
94 
/* Reports the solver-package name (MATSOLVERCUSPARSE, i.e. "cusparse") for factor
   matrices produced by this package; composed onto factors as "MatFactorGetSolverType_C". */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
101 
102 /*MC
103   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
104   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
106   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
107   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
108   algorithms are not recommended. This class does NOT support direct solver operations.
109 
110   Level: beginner
111 
112 .seealso: `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
113 M*/
114 
/*
  Creates an (empty) factor matrix of type MATSEQAIJCUSPARSE for A and installs the
  symbolic-factorization routines matching the requested factor type.  The GPU-aware
  symbolic routines are installed unless A is bound to the CPU, in which case the plain
  SeqAIJ ones are used.  Supported factor types: LU, ILU, ILUDT, Cholesky, ICC.

  Input Parameters:
+ A     - the matrix to be factored (only its size/comm/binding state are consulted here)
- ftype - the requested MatFactorType

  Output Parameter:
. B - the new factor matrix (no numerical data yet)
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscInt n = A->rmap->n;
  Mat      F;

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B));
  F = *B;
  PetscCall(MatSetSizes(F,n,n,n,n));
  F->factortype = ftype;
  PetscCall(MatSetType(F,MATSEQAIJCUSPARSE));

  /* keep the factor on the CPU when the original matrix is pinned there and binding propagation is requested */
  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(F,PETSC_TRUE));

  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU:
  case MAT_FACTOR_ILUDT:
    PetscCall(MatSetBlockSizesFromMats(F,A,A));
    if (A->boundtocpu) {
      F->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      F->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    } else {
      F->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      F->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    }
    /* prefer MATORDERINGND for full LU; natural ordering for the incomplete variants */
    PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&F->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&F->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&F->preferredordering[MAT_FACTOR_ILUDT]));
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC:
    if (A->boundtocpu) {
      F->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      F->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    } else {
      F->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      F->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    }
    /* prefer MATORDERINGND for full Cholesky; natural ordering for ICC */
    PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&F->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&F->preferredordering[MAT_FACTOR_ICC]));
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  }

  PetscCall(MatSeqAIJSetPreallocation(F,MAT_SKIP_ALLOCATION,NULL));
  F->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)F,"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(0);
}
155 
/* Type-specific implementation of MatCUSPARSESetFormat() for SEQAIJCUSPARSE: both
   supported operations currently set the single storage-format field of the GPU
   struct; any other operation is rejected. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT: /* fall through: SEQAIJCUSPARSE keeps one format for everything */
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
173 
174 /*@
175    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
176    operation. Only the MatMult operation can use different GPU storage formats
177    for MPIAIJCUSPARSE matrices.
178    Not Collective
179 
180    Input Parameters:
181 +  A - Matrix of type SEQAIJCUSPARSE
182 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
183 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
184 
185    Output Parameter:
186 
187    Level: intermediate
188 
189 .seealso: `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
190 @*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation when the matrix type provides one;
     silently a no-op for matrix types that do not compose "MatCUSPARSESetFormat_C" */
  PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));
  PetscFunctionReturn(0);
}
198 
/* Type-specific implementation of MatCUSPARSESetUseCPUSolve(): records whether
   MatSolve should run on the CPU (the flag is consulted in MatLUFactorNumeric_SeqAIJCUSPARSE). */
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A,PetscBool use_cpu)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->use_cpu_solve = use_cpu;
  PetscFunctionReturn(0);
}
207 
208 /*@
209    MatCUSPARSESetUseCPUSolve - Sets use CPU MatSolve.
210 
211    Input Parameters:
212 +  A - Matrix of type SEQAIJCUSPARSE
213 -  use_cpu - set flag for using the built-in CPU MatSolve
214 
215    Output Parameter:
216 
217    Notes:
218    The cuSparse LU solver currently computes the factors with the built-in CPU method
219    and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
220    This method to specify if the solve is done on the CPU or GPU (GPU is the default).
221 
222    Level: intermediate
223 
224 .seealso: `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
225 @*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A,PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation when available; no-op for other matrix types */
  PetscTryMethod(A,"MatCUSPARSESetUseCPUSolve_C",(Mat,PetscBool),(A,use_cpu));
  PetscFunctionReturn(0);
}
233 
/* MatSetOption() for SEQAIJCUSPARSE: intercepts MAT_FORM_EXPLICIT_TRANSPOSE so the
   cached GPU transpose can be invalidated when the option is switched off; every
   other option is forwarded to the SeqAIJ implementation. */
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* destroy any stored transpose when turning the option off, to avoid stale data if it is re-enabled later */
    if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
    A->form_explicit_transpose = flg;
  } else {
    PetscCall(MatSetOption_SeqAIJ(A,op,flg));
  }
  PetscFunctionReturn(0);
}
249 
250 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
251 
/*
  Numeric LU factorization for SEQAIJCUSPARSE.  The factorization itself is performed
  by the CPU SeqAIJ kernel (after making sure the host copy of A is current); this
  routine then installs the MatSolve/MatSolveTranspose implementations — the
  natural-ordering variants when both row and column permutations are identities —
  and, when solving on the GPU, triggers the solve analysis and upload of the factors.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ         *b    = (Mat_SeqAIJ*)B->data;
  IS                 isrow = b->row,iscol = b->col;
  PetscBool          row_identity,col_identity;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)B->spptr;

  PetscFunctionBegin;
  /* bring A's values to the host, then factor with the CPU kernel */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatLUFactorNumeric_SeqAIJ(B,A,info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* determine which version of MatSolve needs to be used */
  PetscCall(ISIdentity(isrow,&row_identity));
  PetscCall(ISIdentity(iscol,&col_identity));
  if (!cusp->use_cpu_solve) {
    if (row_identity && col_identity) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
  }
  /* no matrix-RHS solve support in either configuration */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* stage the triangular factors on the GPU for the device solves */
  if (!cusp->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(0);
}
288 
/*
  Processes the -mat_cusparse_* options database entries for a SEQAIJCUSPARSE matrix.
  Options are only consulted for non-factored matrices.  For CUDA >= 11 the cuSPARSE
  algorithm choices are additionally sanity-checked against the cuSPARSE enum values,
  because PetscOptionsEnum() derives the value from the name's position in the
  Mat* name tables above.
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject,"SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    /* storage format used for MatMult only */
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                               "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format));

    /* storage format used for all operations */
    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                               "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format));
    /* PetscOptionsBool() already writes into cusparsestruct->use_cpu_solve; the Set call re-applies it through the method interface */
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve","Use CPU (I)LU solve","MatCUSPARSESetUseCPUSolve",cusparsestruct->use_cpu_solve,&cusparsestruct->use_cpu_solve,&flg));
    if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A,cusparsestruct->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                               "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg));
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
#if PETSC_PKG_CUDA_VERSION_GE(11,2,0)
    /* CUDA 11.2 renamed CUSPARSE_CSRMV_ALG1 to CUSPARSE_SPMV_CSR_ALG1; both must still equal 2 for the table mapping to hold */
    PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#else
    PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                               "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg));
    PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                               "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg));
    PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(0);
}
328 
/* ILU symbolic factorization: drop any previously built GPU triangular-factor data,
   delegate the symbolic work to the CPU SeqAIJ routine, and point the numeric stage
   at the CUSPARSE implementation. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&factors));
  PetscCall(MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
339 
/* LU symbolic factorization: drop any previously built GPU triangular-factor data,
   delegate the symbolic work to the CPU SeqAIJ routine, and point the numeric stage
   at the CUSPARSE implementation. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&factors));
  PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
350 
/* ICC symbolic factorization: drop any previously built GPU triangular-factor data,
   delegate the symbolic work to the CPU SeqAIJ routine, and point the numeric stage
   at the CUSPARSE implementation. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&factors));
  PetscCall(MatICCFactorSymbolic_SeqAIJ(B,A,perm,info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
361 
/* Cholesky symbolic factorization: drop any previously built GPU triangular-factor
   data, delegate the symbolic work to the CPU SeqAIJ routine, and point the numeric
   stage at the CUSPARSE implementation. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&factors));
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
372 
/*
  Builds (or numerically refreshes) the unit-diagonal lower-triangular factor L on the
  GPU from the host ILU/LU factorization stored in A.  The host factor's row pointers
  ai/aj are traversed as the strictly-lower structure of each row (the code appends the
  unit diagonal explicitly), producing a standalone CSR matrix handed to cuSPARSE along
  with its triangular-solve analysis data.

  First call: the full CSR structure (row offsets, column indices, values) is assembled
  in pinned host memory, uploaded to device THRUST arrays, and the cuSPARSE solve
  analysis is run.  Subsequent calls: only the values array is rebuilt and re-uploaded;
  the sparsity pattern and analysis are reused.

  NOTE(review): the surrounding catch handler catches char* only; thrust usually throws
  std::exception-derived types — confirm this matches the error-handling convention used
  elsewhere in this file.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;    /* host staging buffers for the CSR row offsets / column indices */
  PetscInt                          i,nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);   /* empty matrix: nothing to build */
  /* only (re)build when the up-to-date data lives on the CPU */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* rows 1..n-1 contribute ai[i+1]-ai[i] strictly-lower entries each, plus one unit diagonal per row (n total) */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {  /* first build: assemble structure + values and run the solve analysis */
        PetscScalar                       *AALo;

        /* pinned host memory so the later THRUST assign()s (host-to-device copies) are fast */
        PetscCallCUDA(cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallCUDA(cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt)));

        /* Fill the lower triangular matrix */
        /* row 0 of L holds only the unit diagonal */
        AiLo[0]  = (PetscInt) 0;
        AiLo[n]  = nzLower;
        AjLo[0]  = (PetscInt) 0;
        AALo[0]  = (MatScalar) 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        rowOffset= 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          /* copy the strictly-lower entries, then append the unit diagonal */
          PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz));
          PetscCall(PetscArraycpy(&(AALo[offset]), v, nz));

          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
       #else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
       #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;

        /* device-side CSR arrays; assign() copies from the pinned host buffers.
           Indices go into 32-bit device arrays (THRUSTINTARRAY32) even though the host uses PetscInt */
        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        PetscCallCUSPARSE(cusparse_create_analysis_info(&loTriFactor->solveInfo));
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* newer cuSPARSE needs an explicit work buffer for the triangular-solve analysis */
        PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                               &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize));
      #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                         loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                         loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                         loTriFactor->csrMat->column_indices->data().get(),
                                         #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                         loTriFactor->solveInfo,
                                         loTriFactor->solvePolicy, loTriFactor->solveBuffer));
                                         #else
                                         loTriFactor->solveInfo));
                                         #endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        /* keep AALo cached for the values-only update path; free the index staging buffers */
        loTriFactor->AA_h = AALo;
        PetscCallCUDA(cudaFreeHost(AiLo));
        PetscCallCUDA(cudaFreeHost(AjLo));
        /* sizeof(int) matches the 32-bit device index arrays used above */
        PetscCall(PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar)));
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          PetscCallCUDA(cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar)));
        }
        /* Fill the lower triangular matrix */
        /* same traversal as the first-build loop, but only the values are rewritten —
           the sparsity pattern (row offsets / column indices) is reused unchanged */
        loTriFactor->AA_h[0]  = 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz));
          offset      += nz;
          loTriFactor->AA_h[offset] = 1.0;   /* unit diagonal */
          offset      += 1;
          v  += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower*sizeof(PetscScalar)));
      }
    } catch(char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
516 
517 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
518 {
519   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
520   PetscInt                          n = A->rmap->n;
521   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
522   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
523   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
524   const MatScalar                   *aa = a->a,*v;
525   PetscInt                          *AiUp, *AjUp;
526   PetscInt                          i,nz, nzUpper, offset;
527 
528   PetscFunctionBegin;
529   if (!n) PetscFunctionReturn(0);
530   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
531     try {
532       /* next, figure out the number of nonzeros in the upper triangular matrix. */
533       nzUpper = adiag[0]-adiag[n];
534       if (!upTriFactor) {
535         PetscScalar *AAUp;
536 
537         PetscCallCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar)));
538 
539         /* Allocate Space for the upper triangular matrix */
540         PetscCallCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt)));
541         PetscCallCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt)));
542 
543         /* Fill the upper triangular matrix */
544         AiUp[0]=(PetscInt) 0;
545         AiUp[n]=nzUpper;
546         offset = nzUpper;
547         for (i=n-1; i>=0; i--) {
548           v  = aa + adiag[i+1] + 1;
549           vi = aj + adiag[i+1] + 1;
550 
551           /* number of elements NOT on the diagonal */
552           nz = adiag[i] - adiag[i+1]-1;
553 
554           /* decrement the offset */
555           offset -= (nz+1);
556 
557           /* first, set the diagonal elements */
558           AjUp[offset] = (PetscInt) i;
559           AAUp[offset] = (MatScalar)1./v[nz];
560           AiUp[i]      = AiUp[i+1] - (nz+1);
561 
562           PetscCall(PetscArraycpy(&(AjUp[offset+1]), vi, nz));
563           PetscCall(PetscArraycpy(&(AAUp[offset+1]), v, nz));
564         }
565 
566         /* allocate space for the triangular factor information */
567         PetscCall(PetscNew(&upTriFactor));
568         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
569 
570         /* Create the matrix description */
571         PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
572         PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
573        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
574         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
575        #else
576         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
577        #endif
578         PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
579         PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
580 
581         /* set the operation */
582         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
583 
584         /* set the matrix */
585         upTriFactor->csrMat = new CsrMatrix;
586         upTriFactor->csrMat->num_rows = n;
587         upTriFactor->csrMat->num_cols = n;
588         upTriFactor->csrMat->num_entries = nzUpper;
589 
590         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
591         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
592 
593         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
594         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
595 
596         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
597         upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
598 
599         /* Create the solve analysis information */
600         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
601         PetscCallCUSPARSE(cusparse_create_analysis_info(&upTriFactor->solveInfo));
602       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
603         PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
604                                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
605                                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
606                                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
607                                                &upTriFactor->solveBufferSize));
608         PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize));
609       #endif
610 
611         /* perform the solve analysis */
612         PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
613                                          upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
614                                          upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
615                                          upTriFactor->csrMat->column_indices->data().get(),
616                                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
617                                          upTriFactor->solveInfo,
618                                          upTriFactor->solvePolicy, upTriFactor->solveBuffer));
619                                          #else
620                                          upTriFactor->solveInfo));
621                                          #endif
622         PetscCallCUDA(WaitForCUDA());
623         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));
624 
625         /* assign the pointer */
626         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
627         upTriFactor->AA_h = AAUp;
628         PetscCallCUDA(cudaFreeHost(AiUp));
629         PetscCallCUDA(cudaFreeHost(AjUp));
630         PetscCall(PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar)));
631       } else {
632         if (!upTriFactor->AA_h) {
633           PetscCallCUDA(cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar)));
634         }
635         /* Fill the upper triangular matrix */
636         offset = nzUpper;
637         for (i=n-1; i>=0; i--) {
638           v  = aa + adiag[i+1] + 1;
639 
640           /* number of elements NOT on the diagonal */
641           nz = adiag[i] - adiag[i+1]-1;
642 
643           /* decrement the offset */
644           offset -= (nz+1);
645 
646           /* first, set the diagonal elements */
647           upTriFactor->AA_h[offset] = 1./v[nz];
648           PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz));
649         }
650         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
651         PetscCall(PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar)));
652       }
653     } catch(char *ex) {
654       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
655     }
656   }
657   PetscFunctionReturn(0);
658 }
659 
/*
   Builds the ILU lower and upper triangular factors on the GPU and, when the
   row/column orderings are not the identity, caches the permutation indices
   on the device so MatSolve can apply them without host round trips.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           rowis    = aij->row;
  IS                           colis    = aij->icol;
  PetscBool                    ridentity,cidentity;
  PetscInt                     m = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(factors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));

  /* scratch vector used between the lower and upper triangular solves */
  if (!factors->workVector) factors->workVector = new THRUSTARRAY(m);
  factors->nnz = aij->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* cache the row permutation on the GPU unless it is the identity */
  PetscCall(ISIdentity(rowis,&ridentity));
  if (!ridentity && !factors->rpermIndices) {
    const PetscInt *ridx;

    PetscCall(ISGetIndices(rowis,&ridx));
    factors->rpermIndices = new THRUSTINTARRAY(m);
    factors->rpermIndices->assign(ridx,ridx+m);
    PetscCall(ISRestoreIndices(rowis,&ridx));
    PetscCall(PetscLogCpuToGpu(m*sizeof(PetscInt)));
  }

  /* cache the column permutation on the GPU unless it is the identity */
  PetscCall(ISIdentity(colis,&cidentity));
  if (!cidentity && !factors->cpermIndices) {
    const PetscInt *cidx;

    PetscCall(ISGetIndices(colis,&cidx));
    factors->cpermIndices = new THRUSTINTARRAY(m);
    factors->cpermIndices->assign(cidx,cidx+m);
    PetscCall(ISRestoreIndices(colis,&cidx));
    PetscCall(PetscLogCpuToGpu(m*sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}
702 
/*
   Builds the IC(0) triangular factors on the GPU from the host factor in A.

   The factored matrix uses the SBAIJ layout (only the upper triangle is
   stored; the diagonal entry is the LAST entry of each row in aa/aj).  On the
   first call (no factor structs yet) the CSR structure and values of the
   upper factor U, and the values of L (applied as U^T via a transposed
   solve), are assembled on the host, uploaded, and the cuSPARSE solve
   analysis is run.  On subsequent calls only the numerical values are
   refreshed.

   Fix vs. previous revision: the PetscCheck messages in the refresh path now
   name the factor that is actually missing instead of "cusparseTriFactors".
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data; /* factor is stored with the SBAIJ layout */
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers for the values of U and of L */
      PetscCallCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar)));
      PetscCallCUDA(cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar)));
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt)));

        /* Fill the upper triangular matrix; the diagonal is moved to the
           FIRST position of each row and stored inverted, the off-diagonal
           entries are negated (and for L additionally scaled by the inverted
           diagonal) */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz));
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
       #else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
       #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        PetscCallCUSPARSE(cusparse_create_analysis_info(&upTriFactor->solveInfo));
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                               &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize));
      #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                         upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                         upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                         upTriFactor->csrMat->column_indices->data().get(),
                                         #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                         upTriFactor->solveInfo,
                                         upTriFactor->solvePolicy, upTriFactor->solveBuffer));
                                         #else
                                         upTriFactor->solveInfo));
                                         #endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
       #else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
       #endif
        /* L is applied as U^T: same (upper) storage, transposed solve op */
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix: shares the CSR structure of U, values from AALo */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        PetscCallCUSPARSE(cusparse_create_analysis_info(&loTriFactor->solveInfo));
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                               &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize));
      #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                         loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                         loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                         loTriFactor->csrMat->column_indices->data().get(),
                                         #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                         loTriFactor->solveInfo,
                                         loTriFactor->solvePolicy, loTriFactor->solveBuffer));
                                         #else
                                         loTriFactor->solveInfo));
                                         #endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        PetscCall(PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar))));
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
      } else {
        /* structure already on the GPU: refresh only the numerical values.
           Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        PetscCheck(upTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing upTriFactor");
        PetscCheck(loTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing loTriFactor");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        PetscCall(PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar)));
      }
      PetscCallCUDA(cudaFreeHost(AAUp));
      PetscCallCUDA(cudaFreeHost(AALo));
    } catch(char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
919 
/*
   Builds the IC(0) triangular factors on the GPU and, for a non-identity
   ordering, caches both the permutation and its inverse on the device for
   use by the permuted MatSolve path.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           perm     = aij->row;
  PetscBool                    identity;
  PetscInt                     m = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(factors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  /* scratch vector used between the two triangular solves */
  if (!factors->workVector) factors->workVector = new THRUSTARRAY(m);
  /* nnz of the symmetric factor pair: off-diagonals counted twice plus the diagonal */
  factors->nnz = (aij->nz-m)*2 + m;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices */
  PetscCall(ISIdentity(perm,&identity));
  if (!identity) {
    IS             inv;
    const PetscInt *invidx,*idx;

    PetscCall(ISInvertPermutation(perm,PETSC_DECIDE,&inv));
    PetscCall(ISGetIndices(inv,&invidx));
    PetscCall(ISGetIndices(perm,&idx));
    factors->rpermIndices = new THRUSTINTARRAY(m);
    factors->rpermIndices->assign(idx,idx+m);
    factors->cpermIndices = new THRUSTINTARRAY(m);
    factors->cpermIndices->assign(invidx,invidx+m);
    PetscCall(ISRestoreIndices(inv,&invidx));
    PetscCall(ISDestroy(&inv));
    PetscCall(ISRestoreIndices(perm,&idx));
    PetscCall(PetscLogCpuToGpu(2.*m*sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}
956 
/*
   Numeric Cholesky factorization for SEQAIJCUSPARSE: factor on the CPU via
   the SeqAIJ kernel, then upload the triangular factors to the GPU and run
   the cuSPARSE solve analysis.  Chooses the natural-ordering solve kernels
   (which skip permutation work) when the ordering is the identity.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *factor = (Mat_SeqAIJ*)B->data;
  IS             perm    = factor->row;
  PetscBool      natural;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B,A,info));
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  PetscCall(ISIdentity(perm,&natural));
  if (natural) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  /* no GPU mat-solve implementations are provided in either case */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(0);
}
985 
/*
   Builds the transposes (CSC forms) of the cached lower and upper triangular
   factors and runs the cuSPARSE solve analysis on them so MatSolveTranspose
   can run entirely on the GPU.  Results are stashed in
   loTriFactorPtrTranspose/upTriFactorPtrTranspose of A->spptr.

   Fix vs. previous revision: the MAT_CUSPARSEGenerateTranspose log event was
   "begun" twice (PetscLogEventBegin where PetscLogEventEnd was required)
   after each csr2csc conversion, which left the event unbalanced and
   corrupted -log_view timings; both calls are now PetscLogEventEnd.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;
  /* TODO: the lower and upper halves below are near-duplicates; refactor into a helper */

  /* allocate space for the transpose of the lower triangular factor */
  PetscCall(PetscNew(&loTriFactorT));
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor; the fill mode
     flips since the transpose of a lower factor is upper (and vice versa) */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                               loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                               loTriFactor->csrMat->values->data().get(),
                                               loTriFactor->csrMat->row_offsets->data().get(),
                                               loTriFactor->csrMat->column_indices->data().get(),
                                               loTriFactorT->csrMat->values->data().get(),
                                               loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                               CUSPARSE_ACTION_NUMERIC,indexBase,
                                               CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize));
#endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
  PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                  loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                  loTriFactor->csrMat->values->data().get(),
                                  loTriFactor->csrMat->row_offsets->data().get(),
                                  loTriFactor->csrMat->column_indices->data().get(),
                                  loTriFactorT->csrMat->values->data().get(),
                                  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                                  loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                  CUSPARSE_ACTION_NUMERIC, indexBase,
                                  CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer));
                                  #else
                                  loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                  CUSPARSE_ACTION_NUMERIC, indexBase));
                                  #endif
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); /* was erroneously PetscLogEventBegin */

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
  PetscCallCUSPARSE(cusparse_create_analysis_info(&loTriFactorT->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                         loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                         loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                         loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                         &loTriFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize));
#endif

  /* perform the solve analysis */
  PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                   loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                   loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                   loTriFactorT->csrMat->column_indices->data().get(),
                                   #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                   loTriFactorT->solveInfo,
                                   loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
                                   #else
                                   loTriFactorT->solveInfo));
                                   #endif
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  PetscCall(PetscNew(&upTriFactorT));
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                               upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                               upTriFactor->csrMat->values->data().get(),
                                               upTriFactor->csrMat->row_offsets->data().get(),
                                               upTriFactor->csrMat->column_indices->data().get(),
                                               upTriFactorT->csrMat->values->data().get(),
                                               upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                               CUSPARSE_ACTION_NUMERIC,indexBase,
                                               CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize));
#endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
  PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                                  upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                  upTriFactor->csrMat->values->data().get(),
                                  upTriFactor->csrMat->row_offsets->data().get(),
                                  upTriFactor->csrMat->column_indices->data().get(),
                                  upTriFactorT->csrMat->values->data().get(),
                                  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                                  upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                  CUSPARSE_ACTION_NUMERIC, indexBase,
                                  CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer));
                                  #else
                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                  CUSPARSE_ACTION_NUMERIC, indexBase));
                                  #endif

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); /* was erroneously PetscLogEventBegin */

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
  PetscCallCUSPARSE(cusparse_create_analysis_info(&upTriFactorT->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                         upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                         upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                         upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                         &upTriFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize));
  #endif

  /* perform the solve analysis */
  PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                   upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                   upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                   upTriFactorT->csrMat->column_indices->data().get(),
                                   #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                   upTriFactorT->solveInfo,
                                   upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
                                   #else
                                   upTriFactorT->solveInfo));
                                   #endif

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
1189 
/* Unary functor mapping a PetscScalar to a PetscInt by truncating its real
   part; used with thrust::transform to turn scalar-valued device arrays
   (e.g. csr2csc permutation values) into integer index arrays. */
struct PetscScalarToPetscInt
{
  __host__ __device__
  PetscInt operator()(PetscScalar s)
  {
    return static_cast<PetscInt>(PetscRealPart(s));
  }
};
1198 
/* Build or refresh the explicit transpose of A on the GPU, stored in
   cusparsestruct->matTranspose.

   For CSR storage the transpose is produced with csr2csc. The first call also
   caches the scalar->index permutation csr2csc_i, so later numeric-only
   refreshes reduce to a device-side gather (the final thrust::copy below).
   For ELL/HYB formats (CUDA < 11 only) the path is HYB -> CSR -> CSC -> HYB.
   No-op when A->transupdated is already set. Collective GPU work is logged
   under MAT_CUSPARSEGenerateTranspose. */
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  PetscCheck(matstruct,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  PetscCheck(!A->transupdated || matstructT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(0); /* cached transpose is still current */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
  PetscCall(PetscLogGpuTimeBegin());
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
    /* non-CSR formats cannot do a cheap numeric-only update: rebuild from scratch */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
  }
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
    PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

    /* set alpha and beta */
    PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      CsrMatrix *matrixT = new CsrMatrix;
      matstructT->mat = matrixT;
      matrixT->num_rows = A->cmap->n; /* transpose swaps the row/column dimensions */
      matrixT->num_cols = A->rmap->n;
      matrixT->num_entries = a->nz;
      matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values = new THRUSTARRAY(a->nz);

      /* keep a device copy of A's row offsets; csr2csc below reads them */
      if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      #if PETSC_PKG_CUDA_VERSION_GE(11,2,1)
        stat = cusparseCreateCsr(&matstructT->matDescr,
                               matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                               matrixT->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase,cusparse_scalartype);PetscCallCUSPARSE(stat);
      #else
        /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
           see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

           I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
           it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
           when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
        */
        if (matrixT->num_entries) {
          stat = cusparseCreateCsr(&matstructT->matDescr,
                                 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                                 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                                 matrixT->values->data().get(),
                                 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I,
                                 indexBase,cusparse_scalartype);PetscCallCUSPARSE(stat);

        } else {
          matstructT->matDescr = NULL;
          matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
        }
      #endif
     #endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
   #else
      /* HYB has no direct transpose routine: round-trip through CSR/CSC */
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows = A->rmap->n;
      temp->num_cols = A->cmap->n;
      temp->num_entries = a->nz;
      temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle,
                              matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get());PetscCallCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      tempT->num_rows = A->rmap->n;
      tempT->num_cols = A->cmap->n;
      tempT->num_entries = a->nz;
      tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                              temp->num_cols, temp->num_entries,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get(),
                              tempT->values->data().get(),
                              tempT->column_indices->data().get(),
                              tempT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);PetscCallCUSPARSE(stat);

      /* Last, convert CSC to HYB */
      cusparseHybMat_t hybMat;
      PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
      cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
        CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                              matstructT->descr, tempT->values->data().get(),
                              tempT->row_offsets->data().get(),
                              tempT->column_indices->data().get(),
                              hybMat, 0, partition);PetscCallCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY*) tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
        delete (CsrMatrix*) tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY*) temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
        delete (CsrMatrix*) temp;
      }
     #endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
    PetscCheck(matrix,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix");
    PetscCheck(matrix->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows");
    PetscCheck(matrix->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols");
    PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values");
    PetscCheck(matrixT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT");
    PetscCheck(matrixT->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows");
    PetscCheck(matrixT->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols");
    PetscCheck(matrixT->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
    }
    if (!cusparsestruct->csr2csc_i) {
      /* First time through: run csr2csc on the sequence 0,1,2,... so the transposed
         "values" are the source positions; convert them to csr2csc_i, the gather
         permutation reused by all later numeric-only transpose updates. */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* cusparseCsr2cscEx2 requires an explicit user-allocated work buffer */
      void   *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                           A->cmap->n, matrix->num_entries,
                                           matrix->values->data().get(),
                                           cusparsestruct->rowoffsets_gpu->data().get(),
                                           matrix->column_indices->data().get(),
                                           matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                           CUSPARSE_ACTION_NUMERIC,indexBase,
                                           cusparsestruct->csr2cscAlg, &csr2cscBufferSize);PetscCallCUSPARSE(stat);
      PetscCallCUDA(cudaMalloc(&csr2cscBuffer,csr2cscBufferSize));
     #endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                              A->cmap->n,matrix->num_entries,
                              csr2csc_a.data().get(),
                              cusparsestruct->rowoffsets_gpu->data().get(),
                              matrix->column_indices->data().get(),
                              matrixT->values->data().get(),
                             #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                              matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                              CUSPARSE_ACTION_NUMERIC,indexBase,
                              cusparsestruct->csr2cscAlg, csr2cscBuffer);PetscCallCUSPARSE(stat);
                             #else
                              matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);PetscCallCUSPARSE(stat);
                             #endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
      }

      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      PetscCallCUDA(cudaFree(csr2cscBuffer));
     #endif
    }
    /* numeric update: gather A's values through the cached permutation into the transpose */
    PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
                                                     thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
                                                     matrixT->values->begin()));
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
1426 
1427 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/* Solve A^T x = b on the GPU using the cached transposed triangular factors
   (built lazily by MatSeqAIJCUSPARSEAnalyzeTransposeForSolve). Since
   (LU)^T = U^T L^T, U is solved before L here. Row/column permutations from
   the factorization ordering are applied with thrust gathers before and after
   the two triangular solves. bb is read-only; xx is overwritten. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
  PetscCall(VecCUDAGetArrayRead(bb,&barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                        tempGPU->data().get());PetscCallCUSPARSE(stat);
                      #endif

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                         xarray);PetscCallCUSPARSE(stat);
                      #endif

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb,&barray));
  PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1514 
/* Solve A^T x = b on the GPU for a factorization in natural ordering: identical
   to MatSolveTranspose_SeqAIJCUSPARSE but without the row/column permutation
   gathers, since rperm/cperm are the identity here. U^T is solved first, then
   L^T, with tempGPU as the intermediate vector. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
  PetscCall(VecCUDAGetArrayRead(bb,&barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                        tempGPU->data().get());PetscCallCUSPARSE(stat);
                      #endif

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                        xarray);PetscCallCUSPARSE(stat);
                      #endif

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb,&barray));
  PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1583 
/* Solve A x = b on the GPU with the cached LU triangular factors: gather b
   through the row permutation into tempGPU, solve L then U with csrsv, and
   scatter the result through the column permutation into x. bb is read-only;
   xx is overwritten. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
  PetscCall(VecCUDAGetArrayRead(bb,&barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                         xarray,
                         loTriFactor->solvePolicy, loTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                         xarray);PetscCallCUSPARSE(stat);
                      #endif

  /* Then, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,xarray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                        tempGPU->data().get());PetscCallCUSPARSE(stat);
                      #endif

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  PetscCall(VecCUDARestoreArrayRead(bb,&barray));
  PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1658 
/* Solve A x = b on the GPU for a factorization in natural ordering: the
   permutation gathers of MatSolve_SeqAIJCUSPARSE are unnecessary, so solve
   L then U directly, using tempGPU for the intermediate vector. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
  PetscCall(VecCUDAGetArrayRead(bb,&barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        loTriFactor->solvePolicy,loTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                        tempGPU->data().get());PetscCallCUSPARSE(stat);
                      #endif

  /* Next, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                        xarray);PetscCallCUSPARSE(stat);
                      #endif

  PetscCall(VecCUDARestoreArrayRead(bb,&barray));
  PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1719 
/* Synchronize the host copy of the numerical values of A with the device copy.
   Only needed (and only done) when the freshest values live exclusively on the GPU;
   afterwards both copies agree and the offload mask is set to PETSC_OFFLOAD_BOTH. */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *aij  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    CsrMatrix *csr = (CsrMatrix*)cusp->mat->mat;

    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0));
    /* device -> host copy of the value array only; the pattern is unchanged */
    PetscCallCUDA(cudaMemcpy(aij->a, csr->values->data().get(), aij->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    PetscCallCUDA(WaitForCUDA());
    PetscCall(PetscLogGpuToCpu(aij->nz*sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0));
    A->offloadmask = PETSC_OFFLOAD_BOTH; /* host and device copies now agree */
  }
  PetscFunctionReturn(0);
}
1738 
/* Give read/write host access to the value array of A, first making sure the
   host copy is current with respect to the device. */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  aij    = (Mat_SeqAIJ*)A->data;
  *array = aij->a;
  PetscFunctionReturn(0);
}
1746 
/* End read/write host access obtained with MatSeqAIJGetArray_SeqAIJCUSPARSE().
   The caller may have modified the host values, so the device copy is marked stale. */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU; /* host now holds the only up-to-date values */
  PetscFunctionReturn(0);
}
1754 
/* Give read-only host access to the value array of A, first making sure the
   host copy is current with respect to the device. */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  aij    = (Mat_SeqAIJ*)A->data;
  *array = aij->a;
  PetscFunctionReturn(0);
}
1762 
/* End read-only host access obtained with MatSeqAIJGetArrayRead_SeqAIJCUSPARSE().
   Access was read-only, so the offload state is left untouched. */
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(0);
}
1769 
/* Give write-only host access to the value array of A. Since the caller promises
   not to read the old contents, no device-to-host synchronization is performed. */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  *array = aij->a;
  PetscFunctionReturn(0);
}
1776 
/* End write-only host access obtained with MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE().
   The host values were (potentially) overwritten, so the device copy is marked stale. */
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU; /* host now holds the only up-to-date values */
  PetscFunctionReturn(0);
}
1784 
/* Expose the device-resident CSR arrays of A (row offsets, column indices, values)
   and report the memory type as CUDA. Any of i, j, a, mtype may be NULL when the
   caller does not need that output. Not usable with factored matrices, and the
   32-bit cuSPARSE index arrays cannot be aliased as 64-bit PetscInt. */
static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A,const PetscInt **i,const PetscInt **j,PetscScalar **a,PetscMemType *mtype)
{
  Mat_SeqAIJCUSPARSE           *cusp;
  CsrMatrix                    *matrix;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* make sure the device copy is current before handing out pointers */
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
  cusp = static_cast<Mat_SeqAIJCUSPARSE*>(A->spptr);
  PetscCheck(cusp != NULL,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"cusp is NULL");
  matrix = (CsrMatrix*)cusp->mat->mat;

  if (i) {
   #if !defined(PETSC_USE_64BIT_INDICES)
    *i = matrix->row_offsets->data().get();
   #else
    /* row offsets are stored as THRUSTINTARRAY32 (32-bit); they cannot be returned as 64-bit PetscInt */
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSparse does not support 64-bit indices");
   #endif
  }
  if (j) {
   #if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
   #else
    /* column indices are stored as THRUSTINTARRAY32 (32-bit); they cannot be returned as 64-bit PetscInt */
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSparse does not support 64-bit indices");
   #endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(0);
}
1815 
/* Copy the matrix A from the host to the GPU. When the nonzero pattern is
   unchanged (same nonzerostate) and the format is CSR, only the value array is
   refreshed; otherwise the cuSPARSE structures are destroyed and rebuilt from
   the host CSR (or compressed-row) data. No-op when the device copy is already
   current (offloadmask is GPU or BOTH). Errors if A is bound to the CPU. */
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE; /* set to FALSE when host values are absent, so we do not claim host/device agree */

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu,PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      PetscCheck(!a->nz || a->a,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0));
      matrix->values->assign(a->a, a->a+a->nz); /* host -> device copy of the values */
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
    } else {
      /* nonzero pattern changed (or first copy, or non-CSR format): rebuild everything */
      PetscInt nnz;
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0));
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        /* choose the row structure: compressed-row (only nonempty rows) or full */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheck(ii,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
        /* no host values yet: take nnz from the row offsets and remember host/device do not both hold values */
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
        else nnz = a->nz;
        PetscCheck(!nnz || a->j,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        /* device-resident scalar constants, used with CUSPARSE_POINTER_MODE_DEVICE below */
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                    mat->num_rows, mat->num_cols, mat->num_entries,
                                    mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                    mat->values->data().get(),
                                    CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                    CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);PetscCallCUSPARSE(stat);
          }
         #endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
         #else
          /* build a temporary CSR on the device, convert to HYB, then free the CSR */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
              matstruct->descr, mat->values->data().get(),
              mat->row_offsets->data().get(),
              mat->column_indices->data().get(),
              hybMat, 0, partition);PetscCallCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
         #endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        PetscCall(PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0));
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
1970 
/* thrust functor for zipped (src,dst) tuples: accumulate dst += src */
struct VecCUDAPlusEquals
{
  template <typename ZipTuple>
  __host__ __device__ void operator()(ZipTuple zt)
  {
    thrust::get<1>(zt) += thrust::get<0>(zt);
  }
};
1980 
/* thrust functor for zipped (src,dst) tuples: copy dst = src */
struct VecCUDAEquals
{
  template <typename ZipTuple>
  __host__ __device__ void operator()(ZipTuple zt)
  {
    thrust::get<1>(zt) = thrust::get<0>(zt);
  }
};
1990 
/* thrust functor for zipped (dst,src) tuples: copy dst = src
   (tuple slots swapped with respect to VecCUDAEquals) */
struct VecCUDAEqualsReverse
{
  template <typename ZipTuple>
  __host__ __device__ void operator()(ZipTuple zt)
  {
    thrust::get<0>(zt) = thrust::get<1>(zt);
  }
};
2000 
/* Scratch data attached to C->product for cuSPARSE sparse-dense and
   sparse-sparse matrix products; released by MatDestroy_MatMatCusparse(). */
struct MatMatCusparse {
  PetscBool             cisdense;  /* C was originally MATSEQDENSE (CPU): convert the result back after computing on the GPU */
  PetscScalar           *Bt;       /* device buffer for an explicit B^T (pre-CUDA-11 path, where csrmm cannot transpose B) */
  Mat                   X;         /* intermediate dense product used for MATPRODUCT_PtAP and MATPRODUCT_RARt */
  PetscBool             reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops;    /* flop count for logging; set where the product is computed */
  CsrMatrix             *Bcsr;    /* alternate CSR form of B used when B is stored in compressed row form */

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr; /* sparse descriptor for Bcsr (used instead of B's own descriptor when Bcsr is set) */
  PetscBool             initialized;   /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr;  /* dense descriptor for B; recreated when B's leading dimension changes */
  cusparseDnMatDescr_t  matCDescr;  /* dense descriptor for C (or X); recreated when the leading dimension changes */
  PetscInt              Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  void                  *dBuffer4; /* NOTE(review): presumably SpGEMMreuse work buffers (CUDA >= 11.4); allocated elsewhere, freed in the destructor */
  void                  *dBuffer5;
 #endif
  size_t                mmBufferSize; /* current size of mmBuffer, to avoid reallocating when a smaller buffer suffices */
  void                  *mmBuffer;    /* work buffer for cusparseSpMM / SpGEMM compute */
  void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc; /* descriptor driving the SpGEMM(_reuse) calls */
#endif
};
2025 
/* Destructor for the MatMatCusparse scratch data attached to a Mat_Product. */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *p = (MatMatCusparse *)data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(p->Bt)); /* cudaFree(NULL) is a harmless no-op */
  delete p->Bcsr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* descriptors are only destroyed when they were actually created */
  if (p->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(p->matSpBDescr));
  if (p->matBDescr)   PetscCallCUSPARSE(cusparseDestroyDnMat(p->matBDescr));
  if (p->matCDescr)   PetscCallCUSPARSE(cusparseDestroyDnMat(p->matCDescr));
  if (p->spgemmDesc)  PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(p->spgemmDesc));
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  if (p->dBuffer4)  PetscCallCUDA(cudaFree(p->dBuffer4));
  if (p->dBuffer5)  PetscCallCUDA(cudaFree(p->dBuffer5));
 #endif
  if (p->mmBuffer)  PetscCallCUDA(cudaFree(p->mmBuffer));
  if (p->mmBuffer2) PetscCallCUDA(cudaFree(p->mmBuffer2));
 #endif
  PetscCall(MatDestroy(&p->X));
  PetscCall(PetscFree(data));
  PetscFunctionReturn(0);
}
2049 
2050 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
2051 
/* Numeric phase of C = op(A) op(B) with A of type SEQAIJCUSPARSE and B dense.
   For MATPRODUCT_PtAP / MATPRODUCT_RARt, X = op(A) B is first computed into
   mmdata->X and then multiplied by B^T/B using the dense-dense kernel.
   Uses cusparseSpMM (with cached descriptors/buffers in mmdata) on CUDA >= 11,
   and cusparseXcsrmm (with an explicitly transposed B when needed) otherwise. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A    = product->A;
  B    = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  cusp   = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* pick the operand struct, the cuSPARSE op for A, and the result sizes m x n */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    /* either ask cuSPARSE to transpose on the fly, or use the stored explicit transpose */
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  PetscCheck(mat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda));
  if (!biscuda) PetscCall(MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B));
  PetscCall(MatDenseCUDAGetArrayRead(B,&barray));

  PetscCall(MatDenseGetLDA(B,&blda));
  /* PtAP/RARt write the sparse-dense product into the intermediate X, not into C */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseCUDAGetArrayWrite(mmdata->X,&carray));
    PetscCall(MatDenseGetLDA(mmdata->X,&clda));
  } else {
    PetscCall(MatDenseCUDAGetArrayWrite(C,&carray));
    PetscCall(MatDenseGetLDA(C,&clda));
  }

  PetscCall(PetscLogGpuTimeBegin());
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL));
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL));
      mmdata->Clda = clda;
    }

    /* lazily create the sparse descriptor for A's CSR data */
    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);PetscCallCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmBufferSize);PetscCallCUSPARSE(stat);
    /* grow the work buffer only when the required size exceeds what we already have */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      PetscCallCUDA(cudaFree(mmdata->mmBuffer));
      PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer,mmBufferSize));
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get()));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray));
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->mmBuffer);PetscCallCUSPARSE(stat);
 #else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* explicitly transpose B into mmdata->Bt (allocated during the symbolic phase) */
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);PetscCallCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k    = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);PetscCallCUSPARSE(stat);
 #endif
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(n*2.0*csrmat->num_entries));
  PetscCall(MatDenseCUDARestoreArrayRead(B,&barray));
  /* finish PtAP/RARt with a dense-dense product of B (transposed or not) and X */
  if (product->type == MATPRODUCT_RARt) {
    PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE));
  } else if (product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE));
  } else {
    PetscCall(MatDenseCUDARestoreArrayWrite(C,&carray));
  }
  /* convert back the matrices the caller originally supplied on the CPU */
  if (mmdata->cisdense) {
    PetscCall(MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C));
  }
  if (!biscuda) {
    PetscCall(MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B));
  }
  PetscFunctionReturn(0);
}
2222 
/* Symbolic phase of C = op(A) op(B) with A of type SEQAIJCUSPARSE and B dense:
   sizes and types C, allocates the MatMatCusparse scratch data (including the
   B^T buffer for the pre-CUDA-11 path and the intermediate X for PtAP/RARt),
   and installs the numeric-phase callback. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           nrows,ncols;
  PetscBool          c_was_host_dense,isaijcusp;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *acusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&isaijcusp));
  PetscCheck(isaijcusp,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscCheck(acusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  /* result dimensions depend on which operands are (implicitly) transposed */
  switch (product->type) {
  case MATPRODUCT_AB:
    nrows = A->rmap->n;
    ncols = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    nrows = A->cmap->n;
    ncols = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    nrows = A->rmap->n;
    ncols = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    nrows = B->cmap->n;
    ncols = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    nrows = B->rmap->n;
    ncols = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  PetscCall(MatSetSizes(C,nrows,ncols,nrows,ncols));
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&c_was_host_dense));
  PetscCall(MatSetType(C,MATSEQDENSECUDA));

  /* product data */
  PetscCall(PetscNew(&mmdata));
  mmdata->cisdense = c_was_host_dense;
 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    PetscCallCUDA(cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar)));
  }
 #endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X));
    PetscCall(MatSetType(mmdata->X,MATSEQDENSECUDA));
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      PetscCall(MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n));
    } else {
      PetscCall(MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n));
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2295 
2296 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2297 {
2298   Mat_Product                  *product = C->product;
2299   Mat                          A,B;
2300   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2301   Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
2302   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2303   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2304   PetscBool                    flg;
2305   cusparseStatus_t             stat;
2306   MatProductType               ptype;
2307   MatMatCusparse               *mmdata;
2308 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2309   cusparseSpMatDescr_t         BmatSpDescr;
2310 #endif
2311   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2312 
2313   PetscFunctionBegin;
2314   MatCheckProduct(C,1);
2315   PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
2316   PetscCall(PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg));
2317   PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
2318   mmdata = (MatMatCusparse*)C->product->data;
2319   A = product->A;
2320   B = product->B;
2321   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
2322     mmdata->reusesym = PETSC_FALSE;
2323     Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2324     PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2325     Cmat = Ccusp->mat;
2326     PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
2327     Ccsr = (CsrMatrix*)Cmat->mat;
2328     PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
2329     goto finalize;
2330   }
2331   if (!c->nz) goto finalize;
2332   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
2333   PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2334   PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg));
2335   PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
2336   PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2337   PetscCheck(!B->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2338   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2339   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2340   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2341   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2342   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2343   PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2344   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2345   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
2346 
2347   ptype = product->type;
2348   if (A->symmetric && ptype == MATPRODUCT_AtB) {
2349     ptype = MATPRODUCT_AB;
2350     PetscCheck(product->symbolic_used_the_fact_A_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric");
2351   }
2352   if (B->symmetric && ptype == MATPRODUCT_ABt) {
2353     ptype = MATPRODUCT_AB;
2354     PetscCheck(product->symbolic_used_the_fact_B_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric");
2355   }
2356   switch (ptype) {
2357   case MATPRODUCT_AB:
2358     Amat = Acusp->mat;
2359     Bmat = Bcusp->mat;
2360     break;
2361   case MATPRODUCT_AtB:
2362     Amat = Acusp->matTranspose;
2363     Bmat = Bcusp->mat;
2364     break;
2365   case MATPRODUCT_ABt:
2366     Amat = Acusp->mat;
2367     Bmat = Bcusp->matTranspose;
2368     break;
2369   default:
2370     SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2371   }
2372   Cmat = Ccusp->mat;
2373   PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2374   PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2375   PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
2376   Acsr = (CsrMatrix*)Amat->mat;
2377   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
2378   Ccsr = (CsrMatrix*)Cmat->mat;
2379   PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
2380   PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
2381   PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
2382   PetscCall(PetscLogGpuTimeBegin());
2383 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2384   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
2385   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
2386   #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2387     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
2388                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2389                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2390                                mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
2391   #else
2392     stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
2393                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2394                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2395                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);PetscCallCUSPARSE(stat);
2396     stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
2397                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2398                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
2399   #endif
2400 #else
2401   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
2402                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2403                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2404                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2405                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());PetscCallCUSPARSE(stat);
2406 #endif
2407   PetscCall(PetscLogGpuFlops(mmdata->flops));
2408   PetscCallCUDA(WaitForCUDA());
2409   PetscCall(PetscLogGpuTimeEnd());
2410   C->offloadmask = PETSC_OFFLOAD_GPU;
2411 finalize:
2412   /* shorter version of MatAssemblyEnd_SeqAIJ */
2413   PetscCall(PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz));
2414   PetscCall(PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n"));
2415   PetscCall(PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax));
2416   c->reallocs         = 0;
2417   C->info.mallocs    += 0;
2418   C->info.nz_unneeded = 0;
2419   C->assembled = C->was_assembled = PETSC_TRUE;
2420   C->num_ass++;
2421   PetscFunctionReturn(0);
2422 }
2423 
/* Symbolic phase of C = A*B (or A^t*B / A*B^t) for two MATSEQAIJCUSPARSE matrices.

   Builds the cuSPARSE descriptors and CSR storage of C on the GPU, computes the
   sparsity pattern via cusparseSpGEMMreuse (CUDA >= 11.4), cusparseSpGEMM
   (CUDA >= 11.0) or the legacy csrgemm interface, and mirrors the pattern back to
   the host Mat_SeqAIJ structure so that CPU code sees a fully preallocated C.
   Transposed products are realized with explicit transposes of A or B since the
   cuSPARSE spgemm kernels only accept non-transposed operands (see opA/opB below).
   Work buffers needed again by the numeric phase are stashed in the MatMatCusparse
   product data (mmdata). */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *a,*b,*c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     i,j,m,n,k;
  PetscBool                    flg;
  cusparseStatus_t             stat;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble               flops;
  PetscBool                    biscompressed,ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
  cusparseSpMatDescr_t         BmatSpDescr;
#else
  int                          cnz;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ*)A->data;
  b = (Mat_SeqAIJ*)B->data;
  /* product data: freed by MatDestroy_MatMatCusparse when the product is destroyed */
  PetscCall(PetscNew(&mmdata));
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");

  /* For symmetric operands a transposed product is equivalent to MATPRODUCT_AB;
     record that fact so the numeric phase can verify it made the same choice */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
  }
  if (B->symmetric && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
  }
  /* Select operands: transposed products use the explicitly formed transpose of A or B.
     m x n are the dimensions of C, k is the contracted dimension */
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  switch (ptype) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    k = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  PetscCall(MatSetSizes(C,m,n,m,n));
  PetscCall(MatSetType(C,MATSEQAIJCUSPARSE));
  c     = (Mat_SeqAIJ*)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
  Ccsr  = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, then c will be in compressed row format */
    c->compressedrow.nrows = a->compressedrow.nrows;
    PetscCall(PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex));
    PetscCall(PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows));
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
  Ccusp->mat      = Cmat;
  Ccusp->mat->mat = Ccsr;
  Ccsr->num_rows    = Ccusp->nrows;
  Ccsr->num_cols    = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
  PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  /* device-side scalars used with CUSPARSE_POINTER_MODE_DEVICE in the spgemm calls */
  PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar)));
  PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar)));
  PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
  PetscCallCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
    thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
    c->nz = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    goto finalizesym; /* empty product: skip spgemm entirely, just set up host mirrors */
  }

  PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix*)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix */
    /* B stays in compressed-row storage on the GPU; build a shallow CSR view of the
       full matrix that shares column indices and values but uses full row offsets */
    CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
    Bcsr = new CsrMatrix;
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices;
    Bcsr->values         = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt)));
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr = Bcsr; /* the numeric phase must use this view too */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
                               Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                               Bcsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  /* precompute flops count: 2 flops per (A entry, matching B row entry) pair */
  if (ptype == MATPRODUCT_AB) {
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i+1];
      for (j=st; j<en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2.*(b->i[brow+1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt anzi = a->i[i+1] - a->i[i];
      const PetscInt bnzi = b->i[i+1] - b->i[i];
      flops += (2.*anzi)*bnzi;
    }
  } else { /* TODO */
    flops = 0.;
  }

  mmdata->flops = flops;
  PetscCall(PetscLogGpuTimeBegin());

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  /* C descriptor starts with nnz = 0 and NULL arrays; pointers are attached after nnz is known */
  stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
                          NULL, NULL, NULL,
                          CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                          CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
  PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
 {
  /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
     We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
  */
  void*  dBuffer1 = NULL;
  void*  dBuffer2 = NULL;
  void*  dBuffer3 = NULL;
  /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
  size_t bufferSize1 = 0;
  size_t bufferSize2 = 0;
  size_t bufferSize3 = 0;
  size_t bufferSize4 = 0;
  size_t bufferSize5 = 0;

  /*----------------------------------------------------------------------*/
  /* ask bufferSize1 bytes for external memory */
  stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                            CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                            &bufferSize1, NULL);PetscCallCUSPARSE(stat);
  PetscCallCUDA(cudaMalloc((void**) &dBuffer1, bufferSize1));
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                            CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                            &bufferSize1, dBuffer1);PetscCallCUSPARSE(stat);

  /*----------------------------------------------------------------------*/
  /* nnz phase: two-call pattern again (query sizes, then compute with buffers) */
  stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                 &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);PetscCallCUSPARSE(stat);
  PetscCallCUDA(cudaMalloc((void**) &dBuffer2, bufferSize2));
  PetscCallCUDA(cudaMalloc((void**) &dBuffer3, bufferSize3));
  PetscCallCUDA(cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4));
  stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                 &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);PetscCallCUSPARSE(stat);
  PetscCallCUDA(cudaFree(dBuffer1));
  PetscCallCUDA(cudaFree(dBuffer2));

  /*----------------------------------------------------------------------*/
  /* get matrix C non-zero entries C_nnz1 */
  PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
  c->nz = (PetscInt) C_nnz1;
  /* allocate matrix C */
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values         = new THRUSTARRAY(c->nz);PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  /* update matC with the new pointers */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());PetscCallCUSPARSE(stat);

  /*----------------------------------------------------------------------*/
  /* copy phase fills C's index arrays; dBuffer5 must outlive this call for reuse_compute */
  stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                  &bufferSize5, NULL);PetscCallCUSPARSE(stat);
  PetscCallCUDA(cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5));
  stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                  &bufferSize5, mmdata->dBuffer5);PetscCallCUSPARSE(stat);
  PetscCallCUDA(cudaFree(dBuffer3));
  /* also fill the numeric values once, so a numeric phase right after symbolic is cheap */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                                     Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                     cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                     mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
  PetscCall(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024));
 }
 #else
  size_t bufSize2;
  /* ask bufferSize bytes for external memory */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, NULL);PetscCallCUSPARSE(stat);
  PetscCallCUDA(cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2));
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);PetscCallCUSPARSE(stat);
  /* ask bufferSize again bytes for external memory */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);PetscCallCUSPARSE(stat);
  /* The CUSPARSE documentation is not clear, nor the API
     We need both buffers to perform the operations properly!
     mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
     it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
     is stored in the descriptor! What a messy API... */
  PetscCallCUDA(cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize));
  /* compute the intermediate product of A * B */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);PetscCallCUSPARSE(stat);
  /* get matrix C non-zero entries C_nnz1 */
  PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
  c->nz = (PetscInt) C_nnz1;
  PetscCall(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024));
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
 #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
#else
  /* legacy (pre CUDA 11) interface: csrgemmNnz to get the pattern, then a full numeric gemm */
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
  stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);PetscCallCUSPARSE(stat);
  c->nz = cnz;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
     I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
     D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops));
  PetscCall(PetscLogGpuTimeEnd());
finalizesym:
  /* mirror the GPU-computed sparsity pattern into the host Mat_SeqAIJ so C looks preallocated */
  c->singlemalloc = PETSC_FALSE;
  c->free_a       = PETSC_TRUE;
  c->free_ij      = PETSC_TRUE;
  PetscCall(PetscMalloc1(m+1,&c->i));
  PetscCall(PetscMalloc1(c->nz,&c->j));
  if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
    PetscInt *d_i = c->i;
    THRUSTINTARRAY ii(Ccsr->row_offsets->size());
    THRUSTINTARRAY jj(Ccsr->column_indices->size());
    ii   = *Ccsr->row_offsets;
    jj   = *Ccsr->column_indices;
    if (ciscompressed) d_i = c->compressedrow.i;
    PetscCallCUDA(cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
  } else {
    PetscInt *d_i = c->i;
    if (ciscompressed) d_i = c->compressedrow.i;
    PetscCallCUDA(cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
  }
  if (ciscompressed) { /* need to expand host row offsets */
    PetscInt r = 0;
    c->i[0] = 0;
    for (k = 0; k < c->compressedrow.nrows; k++) {
      const PetscInt next = c->compressedrow.rindex[k];
      const PetscInt old = c->compressedrow.i[k];
      for (; r < next; r++) c->i[r+1] = old;
    }
    for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
  }
  PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt)));
  PetscCall(PetscMalloc1(m,&c->ilen));
  PetscCall(PetscMalloc1(m,&c->imax));
  c->maxnz = c->nz;
  c->nonzerorowcnt = 0;
  c->rmax = 0;
  /* per-row lengths and row-length statistics from the expanded row offsets */
  for (k = 0; k < m; k++) {
    const PetscInt nn = c->i[k+1] - c->i[k];
    c->ilen[k] = c->imax[k] = nn;
    c->nonzerorowcnt += (PetscInt)!!nn;
    c->rmax = PetscMax(c->rmax,nn);
  }
  PetscCall(MatMarkDiagonal_SeqAIJ(C));
  PetscCall(PetscMalloc1(c->nz,&c->a));
  Ccsr->num_entries = c->nz;

  C->nonzerostate++;
  PetscCall(PetscLayoutSetUp(C->rmap));
  PetscCall(PetscLayoutSetUp(C->cmap));
  Ccusp->nonzerostate = C->nonzerostate;
  C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
  C->preallocated  = PETSC_TRUE;
  C->assembled     = PETSC_FALSE;
  C->was_assembled = PETSC_FALSE;
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
2820 
2821 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2822 
2823 /* handles sparse or dense B */
2824 static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
2825 {
2826   Mat_Product    *product = mat->product;
2827   PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;
2828 
2829   PetscFunctionBegin;
2830   MatCheckProduct(mat,1);
2831   PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense));
2832   if (!product->A->boundtocpu && !product->B->boundtocpu) {
2833     PetscCall(PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp));
2834   }
2835   if (product->type == MATPRODUCT_ABC) {
2836     Ciscusp = PETSC_FALSE;
2837     if (!product->C->boundtocpu) {
2838       PetscCall(PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp));
2839     }
2840   }
2841   if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
2842     PetscBool usecpu = PETSC_FALSE;
2843     switch (product->type) {
2844     case MATPRODUCT_AB:
2845       if (product->api_user) {
2846         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");
2847         PetscCall(PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL));
2848         PetscOptionsEnd();
2849       } else {
2850         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");
2851         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL));
2852         PetscOptionsEnd();
2853       }
2854       break;
2855     case MATPRODUCT_AtB:
2856       if (product->api_user) {
2857         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");
2858         PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL));
2859         PetscOptionsEnd();
2860       } else {
2861         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");
2862         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL));
2863         PetscOptionsEnd();
2864       }
2865       break;
2866     case MATPRODUCT_PtAP:
2867       if (product->api_user) {
2868         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");
2869         PetscCall(PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL));
2870         PetscOptionsEnd();
2871       } else {
2872         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");
2873         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL));
2874         PetscOptionsEnd();
2875       }
2876       break;
2877     case MATPRODUCT_RARt:
2878       if (product->api_user) {
2879         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");
2880         PetscCall(PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL));
2881         PetscOptionsEnd();
2882       } else {
2883         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");
2884         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL));
2885         PetscOptionsEnd();
2886       }
2887       break;
2888     case MATPRODUCT_ABC:
2889       if (product->api_user) {
2890         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");
2891         PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL));
2892         PetscOptionsEnd();
2893       } else {
2894         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");
2895         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL));
2896         PetscOptionsEnd();
2897       }
2898       break;
2899     default:
2900       break;
2901     }
2902     if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
2903   }
2904   /* dispatch */
2905   if (isdense) {
2906     switch (product->type) {
2907     case MATPRODUCT_AB:
2908     case MATPRODUCT_AtB:
2909     case MATPRODUCT_ABt:
2910     case MATPRODUCT_PtAP:
2911     case MATPRODUCT_RARt:
2912      if (product->A->boundtocpu) {
2913         PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
2914       } else {
2915         mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
2916       }
2917       break;
2918     case MATPRODUCT_ABC:
2919       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2920       break;
2921     default:
2922       break;
2923     }
2924   } else if (Biscusp && Ciscusp) {
2925     switch (product->type) {
2926     case MATPRODUCT_AB:
2927     case MATPRODUCT_AtB:
2928     case MATPRODUCT_ABt:
2929       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2930       break;
2931     case MATPRODUCT_PtAP:
2932     case MATPRODUCT_RARt:
2933     case MATPRODUCT_ABC:
2934       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2935       break;
2936     default:
2937       break;
2938     }
2939   } else { /* fallback for AIJ */
2940     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
2941   }
2942   PetscFunctionReturn(0);
2943 }
2944 
/* yy = A*xx: thin wrapper over the shared SpMV kernel (no transpose, not Hermitian) */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE));
  PetscFunctionReturn(0);
}
2951 
/* zz = A*xx + yy: thin wrapper over the shared SpMV kernel (no transpose, not Hermitian) */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE));
  PetscFunctionReturn(0);
}
2958 
/* yy = A^H*xx: thin wrapper over the shared SpMV kernel (transpose + Hermitian) */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE));
  PetscFunctionReturn(0);
}
2965 
/* zz = A^H*xx + yy: thin wrapper over the shared SpMV kernel (transpose + Hermitian) */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE));
  PetscFunctionReturn(0);
}
2972 
/* yy = A^T*xx: thin wrapper over the shared SpMV kernel (transpose, not Hermitian) */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE));
  PetscFunctionReturn(0);
}
2979 
/* Device kernel: y[idx[i]] += x[i] for 0 <= i < n.
   Used below to scatter-add the compressed-row SpMV result (held in the work vector)
   into the full-length output vector zz.
   Notes:
   - The grid-stride loop keeps the kernel correct for any launch configuration
     (the call site launches ceil(n/256) blocks of 256 threads, so each thread
     runs at most one iteration there).
   - Index arithmetic is done in PetscInt (instead of int) so it cannot overflow
     when PETSc is configured with 64-bit indices and n exceeds the 32-bit range.
   - idx and x are read-only, so they are const-qualified. */
__global__ static void ScatterAdd(PetscInt n, const PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  const PetscInt stride = (PetscInt)gridDim.x*blockDim.x;
  for (PetscInt i = (PetscInt)blockIdx.x*blockDim.x + threadIdx.x; i < n; i += stride) y[idx[i]] += x[i];
}
2985 
/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
/* This is the single work-horse behind MatMult/MatMultAdd/MatMultTranspose(Add)/
   MatMultHermitianTranspose(Add) for SEQAIJCUSPARSE. yy may be NULL (plain mult),
   and yy may alias zz (in-place add). */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny;
#endif

  PetscFunctionBegin;
  PetscCheck(!herm || trans,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
  /* Empty matrix: op(A) x is identically zero, so z is just y (or zero) */
  if (!a->nz) {
    if (!yy) PetscCall(VecSet_SeqCUDA(zz,0));
    else PetscCall(VecCopy_SeqCUDA(yy,zz));
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    PetscCheck(matstruct,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    /* Either let cuSPARSE apply the (conjugate) transpose on the fly, or use a
       precomputed explicit transpose when the matrix is configured for one */
    if (herm || !A->form_explicit_transpose) {
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try { /* thrust calls below may throw; caught and converted to a PETSc error at the end */
    PetscCall(VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray));
    if (yy == zz) PetscCall(VecCUDAGetArray(zz,&zarray)); /* read & write zz, so need to get uptodate zarray on GPU */
    else PetscCall(VecCUDAGetArrayWrite(zz,&zarray)); /* write zz, so no need to init zarray on GPU */

    PetscCall(PetscLogGpuTimeBegin());
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
     #endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
     #endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* opA indexes the per-operation cache cuSpMV[], so guard against enum value drift */
      PetscCheck(opA >= 0 && opA <= 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype));
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype));
        PetscCallCUSPARSE(cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                               matstruct->matDescr,
                                               matstruct->cuSpMV[opA].vecXDescr, beta,
                                               matstruct->cuSpMV[opA].vecYDescr,
                                               cusparse_scalartype,
                                               cusparsestruct->spmvAlg,
                                               &matstruct->cuSpMV[opA].spmvBufferSize));
        PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize));

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr));
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr));
      }

      PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA,
                                  matstruct->alpha_one,
                                  matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
                                  matstruct->cuSpMV[opA].vecXDescr,
                                  beta,
                                  matstruct->cuSpMV[opA].vecYDescr,
                                  cusparse_scalartype,
                                  cusparsestruct->spmvAlg,
                                  matstruct->cuSpMV[opA].spmvBuffer));
     #else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA,
                                       mat->num_rows, mat->num_cols,
                                       mat->num_entries, matstruct->alpha_one, matstruct->descr,
                                       mat->values->data().get(), mat->row_offsets->data().get(),
                                       mat->column_indices->data().get(), xptr, beta,
                                       dptr));
     #endif
    } else {
      if (cusparsestruct->nrows) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                         matstruct->alpha_one, matstruct->descr, hybMat,
                                         xptr, beta,
                                         dptr));
       #endif
      }
    }
    PetscCall(PetscLogGpuTimeEnd());

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          PetscCall(VecCopy_SeqCUDA(yy,zz)); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          PetscCall(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        PetscCall(VecSet_SeqCUDA(zz,0));
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        PetscCall(PetscLogGpuTimeBegin());
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
         */
       #if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
       #else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
       #endif
        PetscCall(PetscLogGpuTimeEnd());
      }
    } else {
      if (yy && yy != zz) {
        PetscCall(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */
      }
    }
    PetscCall(VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray));
    if (yy == zz) PetscCall(VecCUDARestoreArray(zz,&zarray));
    else PetscCall(VecCUDARestoreArrayWrite(zz,&zarray));
  } catch(char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  /* Flop counts: 2 flops per stored nonzero with the extra add of y; without y,
     one add per row with a nonzero is saved (beta = 0 path) */
  if (yy) {
    PetscCall(PetscLogGpuFlops(2.0*a->nz));
  } else {
    PetscCall(PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt));
  }
  PetscFunctionReturn(0);
}
3174 
/* zz = A^T*xx + yy: thin wrapper over the shared SpMV kernel (transpose, not Hermitian) */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE));
  PetscFunctionReturn(0);
}
3181 
/* Finish assembly via the base SeqAIJ routine; if the nonzero structure changed,
   drop the cached device-side matrix so it is rebuilt on next use. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscObjectState   statebefore     = A->nonzerostate;

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqAIJ(A,mode));
  /* a new nonzero state invalidates any device mirror of the matrix */
  if (cusparsestruct->deviceMat && statebefore != A->nonzerostate) {
    PetscCall(PetscInfo(A,"Destroy device mat since nonzerostate changed\n"));
    PetscCallCUDA(cudaFree(cusparsestruct->deviceMat));
    cusparsestruct->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}
3197 
3198 /* --------------------------------------------------------------------------------*/
3199 /*@
3200    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
3202    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
3203    assembly performance the user should preallocate the matrix storage by setting
3204    the parameter nz (or the array nnz).  By setting these parameters accurately,
3205    performance during matrix assembly can be increased by more than a factor of 50.
3206 
3207    Collective
3208 
3209    Input Parameters:
3210 +  comm - MPI communicator, set to PETSC_COMM_SELF
3211 .  m - number of rows
3212 .  n - number of columns
3213 .  nz - number of nonzeros per row (same for all rows)
3214 -  nnz - array containing the number of nonzeros in the various rows
3215          (possibly different for each row) or NULL
3216 
3217    Output Parameter:
3218 .  A - the matrix
3219 
3220    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
3222    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
3223 
3224    Notes:
3225    If nnz is given then nz is ignored
3226 
3227    The AIJ format (also called the Yale sparse matrix format or
3228    compressed row storage), is fully compatible with standard Fortran 77
3229    storage.  That is, the stored row and column indices can begin at
3230    either one (as in Fortran) or zero.  See the users' manual for details.
3231 
3232    Specify the preallocated storage with either nz or nnz (not both).
3233    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
3234    allocation.  For large problems you MUST preallocate memory or you
3235    will get TERRIBLE performance, see the users' manual chapter on matrices.
3236 
3237    By default, this format uses inodes (identical nodes) when possible, to
3238    improve numerical efficiency of matrix-vector products and solves. We
3239    search for consecutive rows with the same nonzero structure, thereby
3240    reusing matrix information to achieve increased efficiency.
3241 
3242    Level: intermediate
3243 
3244 .seealso: `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATSEQAIJCUSPARSE`, `MATAIJCUSPARSE`
3245 @*/
/* Create an m-by-n SEQAIJCUSPARSE matrix and preallocate with nz/nnz (see manpage above) */
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscFunctionBegin;
  PetscCall(MatCreate(comm,A));
  PetscCall(MatSetSizes(*A,m,n,m,n)); /* sequential matrix: local sizes equal global sizes */
  PetscCall(MatSetType(*A,MATSEQAIJCUSPARSE));
  /* cast drops const only; the preallocation routine reads nnz */
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz));
  PetscFunctionReturn(0);
}
3255 
/* Free the CUSPARSE-specific data hanging off A->spptr, detach the composed
   method implementations, then destroy the underlying SeqAIJ matrix. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr));
  } else {
    /* factored matrices store triangular-factor data instead of the CSR mirror */
    PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr));
  }
  /* composing NULL removes each function from the object's method table */
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetUseCPUSolve_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaijcusparse_hypre_C",NULL));
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(0);
}
3277 
3278 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
3279 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicate as a plain SeqAIJ matrix, then convert the copy in place to SEQAIJCUSPARSE */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqAIJ(A,cpvalues,B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B));
  PetscFunctionReturn(0);
}
3287 
/* Y += a*X entirely on the GPU.
   Dispatch: SAME_NONZERO_PATTERN -> one cuBLAS axpy on the value arrays;
   SUBSET_NONZERO_PATTERN -> cuSPARSE csrgeam (Y = a*X + 1*Y into Y's pattern);
   otherwise fall back to the CPU SeqAIJ implementation. */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry,*csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
  /* different axpy implementations imply one operand is not (fully) a CUSPARSE matrix: use the CPU path */
  if (X->ops->axpy != Y->ops->axpy) {
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y,a,X,str));
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(cy->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  PetscCheck(cx->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;
  /* see if we can turn this into a cublas axpy */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    /* same nz count: compare row offsets and column indices on the device to detect identical patterns */
    bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
    if (eq) {
      eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
    }
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    PetscScalar b = 1.0; /* keep Y's existing values: Y = a*X + 1*Y */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t      bufferSize;
    void        *buffer;
#endif

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay));
    /* alpha/beta (&a, &b) live on the host for these calls */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* CUDA >= 11 requires an explicitly sized external work buffer for csrgeam */
    PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
                                                  &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                                  &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize));
    PetscCallCUDA(cudaMalloc(&buffer,bufferSize));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                                       &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                       &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                       cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCallCUDA(cudaFree(buffer));
#else
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                                       &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                       &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                       cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get()));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    /* restore the pointer mode expected elsewhere in this file */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else if (str == SAME_NONZERO_PATTERN) {
    cublasHandle_t cublasv2handle;
    PetscBLASInt   one = 1, bnz = 1;

    /* identical patterns: value arrays align entry-for-entry, so a dense axpy suffices */
    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay));
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    PetscCall(PetscBLASIntCast(x->nz,&bnz));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one));
    PetscCall(PetscLogGpuFlops(2.0*bnz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else {
    /* general patterns: fall back to the CPU implementation */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y,a,X,str));
  }
  PetscFunctionReturn(0);
}
3381 
/* Y *= a: scale the nonzero value array directly on the GPU with a cuBLAS scal */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)Y->data;
  cublasHandle_t handle;
  PetscScalar    *aval;
  PetscBLASInt   inc = 1, blasnz = 1;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&aval));
  PetscCall(PetscCUBLASGetHandle(&handle));
  PetscCall(PetscBLASIntCast(aij->nz,&blasnz));
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXscal(handle,blasnz,&a,aval,inc));
  PetscCall(PetscLogGpuFlops(blasnz));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&aval));
  /* scaling the values invalidates any cached factored diagonal */
  PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  PetscFunctionReturn(0);
}
3401 
/* Set all stored values of A to zero, on both the device (when present) and the host */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  Mat_SeqAIJ *aij   = (Mat_SeqAIJ*)A->data;
  PetscBool  ongpu  = PETSC_FALSE; /* set when the device copy was zeroed too */

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
    /* zero the device CSR values in place, and likewise for the cached transpose */
    if (cusp->mat) {
      CsrMatrix *csr = (CsrMatrix*)cusp->mat->mat;
      if (csr->values) {
        ongpu = PETSC_TRUE;
        thrust::fill(thrust::device,csr->values->begin(),csr->values->end(),0.);
      }
    }
    if (cusp->matTranspose) {
      CsrMatrix *csr = (CsrMatrix*)cusp->matTranspose->mat;
      if (csr->values) thrust::fill(thrust::device,csr->values->begin(),csr->values->end(),0.);
    }
  }
  /* always zero the host value array as well */
  PetscCall(PetscArrayzero(aij->a,aij->i[A->rmap->n]));
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  A->offloadmask = ongpu ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
3430 
/* Switch A's method table between the CPU (SeqAIJ) and GPU (CUSPARSE) implementations.
   flg = PETSC_TRUE binds to CPU: data is copied back from the GPU and the base SeqAIJ
   routines are installed; flg = PETSC_FALSE installs the CUSPARSE routines. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  /* factored matrices only record the flag; their solve paths are chosen elsewhere */
  if (A->factortype != MAT_FACTOR_NONE) {
    A->boundtocpu = flg;
    PetscFunctionReturn(0);
  }
  if (flg) {
    /* bring current values back to the host before handing control to CPU routines */
    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* clear all GPU-specific array-access hooks installed in the else branch */
    PetscCall(PetscMemzero(a->ops,sizeof(Mat_SeqAIJOps)));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL));
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    /* route SeqAIJ array access through the GPU-aware versions */
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
   }
  A->boundtocpu = flg;
  /* inodes are only used by the CPU kernels */
  if (flg && a->inode.size) {
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3495 
/* Convert a SeqAIJ matrix to SEQAIJCUSPARSE. Supports MAT_INITIAL_MATRIX (duplicate
   then convert), MAT_REUSE_MATRIX (copy values into *newmat), and MAT_INPLACE_MATRIX
   (convert A itself, also used by MatCreate_SeqAIJCUSPARSE). Allocates the CUSPARSE
   side data (handle, stream, algorithm defaults) and installs the GPU method table. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  Mat              B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    PetscCall(MatDuplicate(A,MAT_COPY_VALUES,newmat));
  } else if (reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatCopy(A,*newmat,SAME_NONZERO_PATTERN));
  }
  B = *newmat;

  /* vectors created for this matrix default to the CUDA type */
  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA,&B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;
      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle,PetscDefaultCudaStream));
      spptr->format     = MAT_CUSPARSE_CSR;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
     #if PETSC_PKG_CUDA_VERSION_GE(11,2,0)
      spptr->spmvAlg    = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
     #else
      spptr->spmvAlg    = CUSPARSE_CSRMV_ALG1;    /* default, since we only support csr */
     #endif
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
     #endif
      B->spptr = spptr;
    } else {
      /* factored matrices carry triangular-factor data instead of the plain CSR mirror */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle,PetscDefaultCudaStream));
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* install the GPU method table (bind-to-CPU with flg = PETSC_FALSE) */
  PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE));
  PetscCall(PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetUseCPUSolve_C",MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(0);
}
3555 
/* Registered constructor for MATSEQAIJCUSPARSE: build a plain SeqAIJ matrix, then
   convert it in place to the CUSPARSE type (which attaches the GPU metadata). */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqAIJ(B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B));
  PetscFunctionReturn(0);
}
3563 
3564 /*MC
3565    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3566 
   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later
   (and are no longer supported since CUDA 11.0).
   All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.
3570 
3571    Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_use_cpu_solve - Do MatSolve on CPU
3576 
3577   Level: beginner
3578 
3579 .seealso: `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
3580 M*/
3581 
3582 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);
3583 
/* Register the CUSPARSE-based solver packages with PETSc's MatSolverType registry. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscFunctionBegin;
  /* banded LU solver accepts plain MATSEQAIJ input */
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND,MATSEQAIJ,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band));
  /* the generic CUSPARSE solver provides LU/Cholesky/ILU/ICC for MATSEQAIJCUSPARSE */
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse));

  PetscFunctionReturn(0);
}
3595 
/* Release the COO-assembly scratch data held by the matrix' GPU structure, covering both
   the 'basic' path (cooPerm/cooPerm_a) and the extended path (jmap_d/perm_d), so that
   MatSetPreallocationCOO can be called again. */
static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat)
{
  Mat_SeqAIJCUSPARSE* cusp = (Mat_SeqAIJCUSPARSE*)mat->spptr;

  PetscFunctionBegin;
  if (!cusp) PetscFunctionReturn(0);
  delete cusp->cooPerm;
  delete cusp->cooPerm_a;
  cusp->cooPerm = NULL;
  cusp->cooPerm_a = NULL;
  if (cusp->use_extended_coo) {
    PetscCallCUDA(cudaFree(cusp->jmap_d));
    PetscCallCUDA(cudaFree(cusp->perm_d));
    /* NULL the pointers after freeing: MatSeqAIJCUSPARSE_Destroy() frees them again when
       non-NULL, so leaving them dangling would cause a double free */
    cusp->jmap_d = NULL;
    cusp->perm_d = NULL;
  }
  cusp->use_extended_coo = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3613 
/* Tear down the whole Mat_SeqAIJCUSPARSE structure: the mat and matTranspose GPU
   representations, thrust work arrays, COO scratch data, and the cuSPARSE handle.
   *cusparsestruct is freed (PetscFree NULLs the pointer). */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscFunctionBegin;
  if (*cusparsestruct) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format));
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    delete (*cusparsestruct)->csr2csc_i;
    if ((*cusparsestruct)->handle) PetscCallCUSPARSE(cusparseDestroy((*cusparsestruct)->handle));
    /* extended-COO device maps; only allocated when MatSetPreallocationCOO took the extended path */
    if ((*cusparsestruct)->jmap_d) PetscCallCUDA(cudaFree((*cusparsestruct)->jmap_d));
    if ((*cusparsestruct)->perm_d) PetscCallCUDA(cudaFree((*cusparsestruct)->perm_d));
    PetscCall(PetscFree(*cusparsestruct));
  }
  PetscFunctionReturn(0);
}
3632 
/* Free a CsrMatrix (its thrust-managed values/column_indices/row_offsets arrays and the
   struct itself) and NULL out the caller's pointer. Safe to call on a NULL entry. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  CsrMatrix *m = *mat;
  if (m) {
    delete m->values;
    delete m->column_indices;
    delete m->row_offsets;
    delete m;
    *mat = NULL;
  }
  PetscFunctionReturn(0);
}
3645 
/* Destroy one triangular-factor structure: matrix descriptor, analysis info, the factor's
   CSR data, and the temporary solve/csr2csc buffers. (Note: despite the shared name, this
   overload operates on a Mat_SeqAIJCUSPARSETriFactorStruct, not the SpMV mult struct.)
   *trifactor is freed (PetscFree NULLs the pointer). */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
    if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparse_destroy_analysis_info((*trifactor)->solveInfo));
    PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
    if ((*trifactor)->solveBuffer)   PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
    if ((*trifactor)->AA_h)   PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h)); /* pinned host buffer */
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
   #endif
    PetscCall(PetscFree(*trifactor));
  }
  PetscFunctionReturn(0);
}
3662 
/* Destroy a Mat_SeqAIJCUSPARSEMultStruct (the GPU representation used for MatMult):
   the stored matrix (CSR, or HYB/ELL pre CUDA-11), its descriptor, the compressed-row
   index array, the device-resident scalar constants, and (CUDA >= 11) the SpMV
   descriptors/buffers. *matstruct is set to NULL on return. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
       #endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        PetscCall(CsrMatrix_Destroy(&mat)); /* check the return code like every other call here */
      }
    }
    if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    /* device-resident 1.0/0.0/1.0 constants used as SpMV alpha/beta */
    if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
    if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
    if ((*matstruct)->beta_one)  PetscCallCUDA(cudaFree((*matstruct)->beta_one));

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
    /* one cached SpMV setup per operation flavor (see cuSpMV[] usage) */
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
3704 
/* Reset the triangular-factor container: destroy the four factor structures, the
   row/column permutation index arrays, the work vector, and the band-factor device
   arrays. The container itself (and its cusparse handle) is kept so the matrix can be
   re-factored; this may therefore be called more than once on the same container. */
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors)
{
  PetscFunctionBegin;
  if (*trifactors) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose));
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = NULL;
    (*trifactors)->cpermIndices = NULL;
    (*trifactors)->workVector = NULL;
    if ((*trifactors)->a_band_d)   PetscCallCUDA(cudaFree((*trifactors)->a_band_d));
    if ((*trifactors)->i_band_d)   PetscCallCUDA(cudaFree((*trifactors)->i_band_d));
    /* NULL after free: a second Reset (or a later Destroy) would otherwise re-enter the
       guarded cudaFree calls above with dangling pointers and double-free them */
    (*trifactors)->a_band_d = NULL;
    (*trifactors)->i_band_d = NULL;
    (*trifactors)->init_dev_prop = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3725 
/* Destroy the triangular-factor container entirely: reset all factor data, destroy the
   cusparse handle, and free the container struct. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscFunctionBegin;
  if (*trifactors) {
    cusparseHandle_t handle = (*trifactors)->handle; /* Reset() leaves the handle untouched */

    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
    if (handle) PetscCallCUSPARSE(cusparseDestroy(handle));
    PetscCall(PetscFree(*trifactors));
  }
  PetscFunctionReturn(0);
}
3740 
/* Strict-weak ordering on (i,j) tuples: lexicographic by row, then by column.
   Used to sort COO entries into CSR order. */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    const PetscInt r1 = t1.get<0>(), r2 = t2.get<0>();
    if (r1 != r2) return r1 < r2;
    return t1.get<1>() < t2.get<1>();
  }
};
3751 
/* Equality on (i,j) tuples: true iff both row and column indices match.
   Used to collapse repeated COO entries. */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>();
  }
};
3761 
/* Binary op for adjacent_difference: 0 when the two values are equal, 1 otherwise.
   Marks positions where the index sequence changes. */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != t2) ? 1 : 0;
  }
};
3770 
/* Binary op: logical OR of the two flags, yielding 0 or 1.
   Combines the row-change and column-change markers. */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != 0 || t2 != 0) ? 1 : 0;
  }
};
3779 
3780 #include <thrust/iterator/discard_iterator.h>
3781 /* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic() */
/* Insert/add the COO values v[] (in the original user ordering) directly into the GPU CSR
   values array, using the permutation (cooPerm) and duplicate map (cooPerm_a) built by
   MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic().

   A     - the matrix, already COO-preallocated via the Basic path
   v     - values, one per original COO entry; may live on host or device; NULL with
           INSERT_VALUES zeros the matrix, NULL with ADD_VALUES is a no-op
   imode - INSERT_VALUES or ADD_VALUES */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY                           *cooPerm_v = NULL;  /* temporary device copy of v when v is on host */
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                             *matrix;
  PetscInt                              n;

  PetscFunctionBegin;
  PetscCheck(cusp,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    /* no COO preallocation data: nothing to scatter, just finish assembly */
    PetscCall(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY));
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) {
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    goto finalize;
  }
  n = cusp->cooPerm->size(); /* number of original COO entries (duplicates included) */
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else {
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v,v+n);
    d_v = cooPerm_v->data();
    PetscCall(PetscLogCpuToGpu(n*sizeof(PetscScalar)));
  }
  PetscCall(PetscLogGpuTimeBegin());
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to add these them */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
        cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonozeros in d_v[].
        cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
      */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      /* then add the per-unique-entry sums into the existing CSR values */
      thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* all nonzeros in d_v[] are unique entries */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]]  */
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else {
      /* unique entries: values[i] = d_v[cooPerm[i]] */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    }
  }
  PetscCall(PetscLogGpuTimeEnd());
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU; /* GPU now holds the authoritative values */
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",A->rmap->n,A->cmap->n,a->nz));
  PetscCall(PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",a->rmax));
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
3862 
/* Mark the cached GPU transpose of A as out of date; when destroy is PETSC_TRUE also free
   the transpose structure and the cached csr2csc index array so they will be rebuilt from
   scratch on next use. */
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (!cusp) PetscFunctionReturn(0);
  if (destroy) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format));
    delete cusp->csr2csc_i;
    cusp->csr2csc_i = NULL;
  }
  A->transupdated = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3878 
3879 #include <thrust/binary_search.h>
3880 /* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */
/* Preallocate A from COO indices entirely on the GPU ('basic' path; no negative indices).

   A     - the matrix
   n     - number of COO entries
   coo_i - row indices of the entries
   coo_j - column indices of the entries

   Sorts (i,j) on the device, builds the permutation cusp->cooPerm from user order to CSR
   order (and, when duplicates exist, the map cusp->cooPerm_a from each entry to its unique
   CSR slot), then fills in the host CSR structure (a->i, a->j, ilen/imax) and pushes a
   zero-valued copy to the GPU. */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0; /* nzr counts rows with at least one nonzero */

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(A->rmap));
  PetscCall(PetscLayoutSetUp(A->cmap));
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    /* entry count changed: discard any previous COO scratch data */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);           /* device copies of the COO indices */
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);   /* per-row upper bounds, becomes a->i[1..n] */

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    PetscCall(PetscLogCpuToGpu(2.*n*sizeof(PetscInt)));
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);

    /* Ex.
      n = 6
      coo_i = [3,3,1,4,1,4]
      coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    PetscCall(PetscLogGpuTimeBegin());
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    *cusp->cooPerm_a = d_i; /* copy the sorted array */
    THRUSTINTARRAY w = d_j;

    /*
      d_i     = [1,1,3,3,4,4]
      d_j     = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i     = [1,3,3,4,4,x]
                            ^ekey
      d_j     = [2,2,3,5,6,x]
                           ^nekye
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());                                              /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
      (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a =          [0,0,1,1,1,1]*/
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin()); /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    PetscCall(PetscLogGpuTimeEnd());

    /* rebuild the host CSR arrays from the device results */
    PetscCall(MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i));
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(A->rmap->n+1,&a->i));
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    PetscCallCUDA(cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    PetscCall(PetscMalloc1(a->nz,&a->a));
    PetscCall(PetscMalloc1(a->nz,&a->j));
    PetscCallCUDA(cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n,&a->ilen));
    if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n,&a->imax));
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    PetscCall(PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt)));
    PetscCall(MatMarkDiagonal_SeqAIJ(A));
  } else {
    /* empty matrix */
    PetscCall(MatSeqAIJSetPreallocation(A,0,NULL));
  }
  PetscCall(MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE));

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  PetscCall(PetscArrayzero(a->a,a->nz));
  PetscCall(MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6));
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
  PetscFunctionReturn(0);
}
3992 
/* COO preallocation entry point: dispatch to the all-GPU 'basic' path when the indices
   are on the device or contain no negative entries; otherwise fall back to the host
   SeqAIJ COO preallocation and mirror its jmap/perm arrays to the device ('extended'
   path, used later by MatSetValuesCOO_SeqAIJCUSPARSE). */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_SeqAIJ         *seq;
  Mat_SeqAIJCUSPARSE *dev;
  PetscBool          coo_basic = PETSC_TRUE;
  PetscMemType       mtype = PETSC_MEMTYPE_DEVICE;

  PetscFunctionBegin;
  /* drop any scratch data from a previous COO preallocation */
  PetscCall(MatResetPreallocationCOO_SeqAIJ(mat));
  PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat));
  if (coo_i) {
    /* only coo_i's memtype is queried; coo_j is assumed to live in the same memory
       space — NOTE(review): confirm callers guarantee this */
    PetscCall(PetscGetMemType(coo_i,&mtype));
    if (PetscMemTypeHost(mtype)) {
      for (PetscCount k=0; k<coo_n; k++) {
        /* negative indices (used to ignore entries) are only handled by the extended path */
        if (coo_i[k] < 0 || coo_j[k] < 0) {coo_basic = PETSC_FALSE; break;}
      }
    }
  }

  if (coo_basic) { /* i,j are on device or do not contain negative indices */
    PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j));
  } else {
    PetscCall(MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));
    seq  = static_cast<Mat_SeqAIJ*>(mat->data);
    dev  = static_cast<Mat_SeqAIJCUSPARSE*>(mat->spptr);
    /* copy the host-built jmap (CSR-slot -> range of repeated entries) and perm
       (range position -> original COO index) maps to the device for MatSetValuesCOO */
    PetscCallCUDA(cudaMalloc((void**)&dev->jmap_d,(seq->nz+1)*sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->jmap_d,seq->jmap,(seq->nz+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMalloc((void**)&dev->perm_d,seq->Atot*sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->perm_d,seq->perm,seq->Atot*sizeof(PetscCount),cudaMemcpyHostToDevice));
    dev->use_extended_coo = PETSC_TRUE;
  }
  PetscFunctionReturn(0);
}
4028 
/* Device kernel (grid-stride loop over the nnz CSR slots): for slot i, sum the user COO
   values kv[perm[k]] for k in [jmap[i], jmap[i+1]) and insert/add the sum into a[i].
   Launched 1-D; any grid size is correct thanks to the stride loop. */
__global__ static void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount jmap[],const PetscCount perm[],InsertMode imode,PetscScalar a[])
{
  /* widen to PetscCount (64-bit) BEFORE multiplying: blockIdx.x*blockDim.x and
     gridDim.x*blockDim.x are unsigned 32-bit products that can overflow when
     nnz exceeds 2^32 */
  PetscCount        i = (PetscCount)blockIdx.x*blockDim.x + threadIdx.x;
  const PetscCount  grid_size = (PetscCount)gridDim.x * blockDim.x;
  for (; i<nnz; i+= grid_size) {
    PetscScalar sum = 0.0;
    for (PetscCount k=jmap[i]; k<jmap[i+1]; k++) sum += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES? 0.0 : a[i]) + sum;
  }
}
4039 
/* Set/add the COO values v[] into A on the GPU.

   A     - the matrix, previously preallocated with MatSetPreallocationCOO_SeqAIJCUSPARSE
   v     - values in the original COO ordering (host or device memory); NULL with
           INSERT_VALUES zeros the matrix, NULL with ADD_VALUES leaves it unchanged
           (matching the Basic path's semantics)
   imode - INSERT_VALUES or ADD_VALUES */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ          *seq = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE  *dev = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscCount          Annz = seq->nz;
  PetscMemType        memtype = PETSC_MEMTYPE_DEVICE; /* initialized: only queried when v is non-NULL */
  const PetscScalar   *v1 = v;
  PetscScalar         *Aa;

  PetscFunctionBegin;
  if (dev->use_extended_coo) {
    if (v) { /* guard: the original code dereferenced v==NULL (cudaMemcpy from NULL, kernel read) */
      PetscCall(PetscGetMemType(v,&memtype));
      if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
        PetscCallCUDA(cudaMalloc((void**)&v1,seq->coo_n*sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy((void*)v1,v,seq->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice));
      }
    }

    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa));
    else PetscCall(MatSeqAIJCUSPARSEGetArray(A,&Aa));

    if (Annz) {
      if (v1) {
        MatAddCOOValues<<<(Annz+255)/256,256>>>(v1,Annz,dev->jmap_d,dev->perm_d,imode,Aa);
        PetscCallCUDA(cudaPeekAtLastError());
      } else if (imode == INSERT_VALUES) {
        /* v == NULL: zero the matrix, mirroring MatSetValuesCOO_SeqAIJCUSPARSE_Basic();
           an all-zero byte pattern is scalar 0.0 for both real and complex */
        PetscCallCUDA(cudaMemset(Aa,0,Annz*sizeof(PetscScalar)));
      }
    }

    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa));
    else PetscCall(MatSeqAIJCUSPARSERestoreArray(A,&Aa));

    if (v && PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void*)v1));
  } else {
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A,v,imode));
  }
  PetscFunctionReturn(0);
}
4074 
4075 /*@C
4076     MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices.
4077 
4078    Not collective
4079 
4080     Input Parameters:
4081 +   A - the matrix
4082 -   compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form
4083 
    Output Parameters:
+   i - the CSR row pointers
-   j - the CSR column indices
4087 
4088     Level: developer
4089 
4090     Notes:
4091       When compressed is true, the CSR structure does not contain empty rows
4092 
4093 .seealso: `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
4094 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  /* NOTE(review): BOTH pointers must be non-NULL or nothing is returned; the per-pointer
     if (i) / if (j) guards below are therefore unreachable with a NULL argument */
  if (!i || !j) PetscFunctionReturn(0);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* make sure the CSR structure exists on the device */
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      /* the device CSR is row-compressed; lazily build and cache the full (uncompressed)
         row-offset array from the host a->i */
      if (!cusp->rowoffsets_gpu) {
        cusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(0);
}
4122 
4123 /*@C
4124     MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ()
4125 
4126    Not collective
4127 
4128     Input Parameters:
4129 +   A - the matrix
4130 -   compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form
4131 
    Output Parameters:
+   i - the CSR row pointers
-   j - the CSR column indices
4135 
4136     Level: developer
4137 
4138 .seealso: `MatSeqAIJCUSPARSEGetIJ()`
4139 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* only invalidate the caller's pointers; the device data stays owned by the matrix */
  if (i) *i = NULL;
  if (j) *j = NULL;
  PetscFunctionReturn(0);
}
4149 
4150 /*@C
4151    MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4152 
4153    Not Collective
4154 
4155    Input Parameter:
4156 .   A - a MATSEQAIJCUSPARSE matrix
4157 
4158    Output Parameter:
4159 .   a - pointer to the device data
4160 
4161    Level: developer
4162 
4163    Notes: may trigger host-device copies if up-to-date matrix data is on host
4164 
4165 .seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
4166 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* may copy host values to the device */
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  /* read-only access: offloadmask is NOT changed, unlike MatSeqAIJCUSPARSEGetArray() */
  *a = csr->values->data().get();
  PetscFunctionReturn(0);
}
4184 
4185 /*@C
4186    MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead()
4187 
4188    Not Collective
4189 
4190    Input Parameter:
4191 .   A - a MATSEQAIJCUSPARSE matrix
4192 
4193    Output Parameter:
4194 .   a - pointer to the device data
4195 
4196    Level: developer
4197 
4198 .seealso: `MatSeqAIJCUSPARSEGetArrayRead()`
4199 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* no state increase: the caller promised read-only access */
  *a = NULL;
  PetscFunctionReturn(0);
}
4209 
4210 /*@C
4211    MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4212 
4213    Not Collective
4214 
4215    Input Parameter:
4216 .   A - a MATSEQAIJCUSPARSE matrix
4217 
4218    Output Parameter:
4219 .   a - pointer to the device data
4220 
4221    Level: developer
4222 
4223    Notes: may trigger host-device copies if up-to-date matrix data is on host
4224 
4225 .seealso: `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
4226 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* a raw values array is only exposed for the CSR storage format */
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* read-write access: the device copy must be current before it is handed out */
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get(); /* raw device pointer into the CSR values array */
  A->offloadmask = PETSC_OFFLOAD_GPU; /* caller may write: the device copy becomes the authoritative one */
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE)); /* any cached transpose values will be stale after writes */
  PetscFunctionReturn(0);
}
4246 /*@C
4247    MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray()
4248 
4249    Not Collective
4250 
   Input Parameters:
+   A - a MATSEQAIJCUSPARSE matrix
-   a - pointer to the device data obtained with MatSeqAIJCUSPARSEGetArray()
4256 
4257    Level: developer
4258 
4259 .seealso: `MatSeqAIJCUSPARSEGetArray()`
4260 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCall(MatSeqAIJInvalidateDiagonal(A)); /* values may have changed, so the cached diagonal is no longer valid */
  PetscCall(PetscObjectStateIncrease((PetscObject)A)); /* signal to observers that the matrix was (possibly) modified */
  *a = NULL; /* zero the caller's pointer so a stale device address cannot be reused */
  PetscFunctionReturn(0);
}
4272 
4273 /*@C
4274    MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4275 
4276    Not Collective
4277 
4278    Input Parameter:
4279 .   A - a MATSEQAIJCUSPARSE matrix
4280 
4281    Output Parameter:
4282 .   a - pointer to the device data
4283 
4284    Level: developer
4285 
4286    Notes: does not trigger host-device copies and flags data validity on the GPU
4287 
4288 .seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
4289 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* a raw values array is only exposed for the CSR storage format */
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* write-only access: no MatSeqAIJCUSPARSECopyToGPU() here, existing values are not
     synced because the caller is expected to overwrite them (see the manpage note) */
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get(); /* raw device pointer into the CSR values array */
  A->offloadmask = PETSC_OFFLOAD_GPU; /* the device copy becomes the authoritative one */
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE)); /* any cached transpose values will be stale after writes */
  PetscFunctionReturn(0);
}
4308 
4309 /*@C
4310    MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite()
4311 
4312    Not Collective
4313 
   Input Parameters:
+   A - a MATSEQAIJCUSPARSE matrix
-   a - pointer to the device data obtained with MatSeqAIJCUSPARSEGetArrayWrite()
4319 
4320    Level: developer
4321 
4322 .seealso: `MatSeqAIJCUSPARSEGetArrayWrite()`
4323 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCall(MatSeqAIJInvalidateDiagonal(A)); /* values were (re)written, so the cached diagonal is no longer valid */
  PetscCall(PetscObjectStateIncrease((PetscObject)A)); /* signal to observers that the matrix was modified */
  *a = NULL; /* zero the caller's pointer so a stale device address cannot be reused */
  PetscFunctionReturn(0);
}
4335 
/* Strict weak ordering on merged COO entries (row, col, value, tag):
   lexicographic comparison on (row, col); value and tag do not participate */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    const int r1 = thrust::get<0>(t1);
    const int r2 = thrust::get<0>(t2);
    if (r1 != r2) return r1 < r2;           /* different rows: order by row index */
    return thrust::get<1>(t1) < thrust::get<1>(t2); /* same row: order by column index */
  }
};
4346 
/* Unary functor adding a fixed offset to an integer index; used below to
   displace B's column indices (by A->cmap->n) and row offsets (by a->nz)
   when concatenating [A B] */
struct Shift
{
  int _shift; /* the offset added to every index */

  Shift(int delta) : _shift(delta) {}
  __host__ __device__
  inline int operator() (const int &idx)
  {
    return _shift + idx;
  }
};
4358 
4359 /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in matlab notation */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     Annz,Bnnz;
  cusparseStatus_t             stat;
  PetscInt                     i,m,n,zero = 0;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidHeaderSpecific(B,MAT_CLASSID,2);
  PetscValidPointer(C,4);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
  PetscCheck(A->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
  PetscCheck(reuse != MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
  PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    /* build C = [A B] from scratch: assemble the device CSR structure first,
       then mirror the (i,j) pattern on the host for the Mat_SeqAIJ part */
    m     = A->rmap->n;
    n     = A->cmap->n + B->cmap->n;
    PetscCall(MatCreate(PETSC_COMM_SELF,C));
    PetscCall(MatSetSizes(*C,m,n,m,n));
    PetscCall(MatSetType(*C,MATSEQAIJCUSPARSE));
    c     = (Mat_SeqAIJ*)(*C)->data;
    Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
    Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr  = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows    = m;
    Ccusp->mat      = Cmat;
    Ccusp->mat->mat = Ccsr;
    Ccsr->num_rows  = m;
    Ccsr->num_cols  = n;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
    PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
    PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
    PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
    PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr = (CsrMatrix*)Acusp->mat->mat;
    Bcsr = (CsrMatrix*)Bcusp->mat->mat;
    Annz = (PetscInt)Acsr->column_indices->size();
    Bnnz = (PetscInt)Bcsr->column_indices->size();
    c->nz = Annz + Bnnz;
    Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    Ccsr->num_entries = c->nz;
    Ccusp->cooPerm = new THRUSTINTARRAY(c->nz); /* records where each A/B entry landed in C, reused on MAT_REUSE_MATRIX */
    if (c->nz) {
      auto Acoo = new THRUSTINTARRAY32(Annz);
      auto Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff,*Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt)));
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      PetscCall(PetscLogGpuTimeBegin());
      /* expand the CSR row offsets of A and B into explicit COO row indices */
      stat = cusparseXcsr2coo(Acusp->handle,
                              Aroff->data().get(),
                              Annz,
                              m,
                              Acoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle,
                              Broff->data().get(),
                              Bnnz,
                              m,
                              Bcoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
#endif
      /* merge the two (row,col,val,tag) streams ordered by (row,col); the constant
         1/0 tag marks whether an output entry originated from A or from B */
      auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
      auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
      auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
      auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
      auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
      auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
      auto p1 = Ccusp->cooPerm->begin();
      auto p2 = Ccusp->cooPerm->begin();
      thrust::advance(p2,Annz);
      PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
      thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
#endif
      /* split the merged positions back into A-owned (first Annz slots of cooPerm)
         and B-owned (remaining Bnnz slots), using the 1/0 tag stored in wPerm */
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      auto pred = thrust::identity<int>();
      PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
      PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
#endif
      /* compress the merged COO row indices back into CSR row offsets for C */
      stat = cusparseXcoo2csr(Ccusp->handle,
                              Ccoo->data().get(),
                              c->nz,
                              m,
                              Ccsr->row_offsets->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat);
      PetscCall(PetscLogGpuTimeEnd());
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
                               Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix *CcsrT = new CsrMatrix;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated = PETSC_TRUE;
        Ccusp->rowoffsets_gpu = NULL;
        CmatT->cprowIndices = NULL;
        CmatT->mat = CcsrT;
        CcsrT->num_rows = n;
        CcsrT->num_cols = m;
        CcsrT->num_entries = c->nz;

        CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values = new THRUSTARRAY(c->nz);

        /* C^T = [A^T; B^T]: stack the transposed CSR structures; B^T row offsets are
           displaced by a->nz since A^T's entries come first in the value array */
        PetscCall(PetscLogGpuTimeBegin());
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
          thrust::advance(rT,-1); /* overwrite A^T's trailing offset with B^T's leading (shifted) one */
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
          thrust::copy(titb,tite,rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        PetscCall(PetscLogGpuTimeEnd());

        PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
                                 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
                                 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* mirror the device CSR pattern on the host so the Mat_SeqAIJ part of C is usable */
    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(m+1,&c->i));
    PetscCall(PetscMalloc1(c->nz,&c->j));
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii   = *Ccsr->row_offsets;
      jj   = *Ccsr->column_indices;
      PetscCallCUDA(cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    } else {
      PetscCallCUDA(cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    }
    PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt)));
    PetscCall(PetscMalloc1(m,&c->ilen));
    PetscCall(PetscMalloc1(m,&c->imax));
    c->maxnz = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax = 0;
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i+1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax,nn);
    }
    PetscCall(MatMarkDiagonal_SeqAIJ(*C));
    PetscCall(PetscMalloc1(c->nz,&c->a));
    (*C)->nonzerostate++;
    PetscCall(PetscLayoutSetUp((*C)->rmap));
    PetscCall(PetscLayoutSetUp((*C)->cmap));
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* MAT_REUSE_MATRIX: the pattern is unchanged, only the numerical values of C
       are refreshed by scattering A's and B's values through the stored cooPerm */
    PetscCheck((*C)->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT,(*C)->rmap->n,B->rmap->n);
    c = (Mat_SeqAIJ*)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
      PetscCheck(Ccusp->cooPerm,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
      PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
      PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate,PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
      PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix*)Acusp->mat->mat;
      Bcsr = (CsrMatrix*)Bcusp->mat->mat;
      Ccsr = (CsrMatrix*)Ccusp->mat->mat;
      PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %" PetscInt_FMT " != %" PetscInt_FMT,Acsr->num_entries,(PetscInt)Acsr->values->size());
      PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %" PetscInt_FMT " != %" PetscInt_FMT,Bcsr->num_entries,(PetscInt)Bcsr->values->size());
      PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT,Ccsr->num_entries,(PetscInt)Ccsr->values->size());
      PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries,PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT,Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
      PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %" PetscInt_FMT " != %" PetscInt_FMT,(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid,Acsr->num_entries);
      PetscCall(PetscLogGpuTimeBegin());
      /* scatter A's values into C via the first Annz entries of cooPerm */
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      thrust::for_each(zibait,zieait,VecCUDAEquals());
      /* scatter B's values into C via the remaining entries of cooPerm */
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
      thrust::for_each(zibbit,ziebit,VecCUDAEquals());
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE));
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        PetscCheck(Ccusp->matTranspose,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      PetscCall(PetscLogGpuTimeEnd());
    }
  }
  PetscCall(PetscObjectStateIncrease((PetscObject)*C));
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
4660 
/* Copy selected entries of A's device values array into v: v[k] = a[idx[k]].
   v may live on the host or on the device (detected via isCudaMem); when idx is
   NULL the first n values are copied contiguously. */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool              dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A,&av));
  if (n && idx) {
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n); /* upload the index set to the device */
    PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt)));

    THRUSTARRAY *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v); /* gather directly into the caller's device buffer */
    } else {
      w = new THRUSTARRAY(n); /* gather into device scratch, then copy down to the host */
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* dv[k] = dav[widx[k]] via a zipped gather */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (w) {
      PetscCallCUDA(cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
    }
    delete w;
  } else {
    PetscCallCUDA(cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  /* bug fix: when v is host memory the data moved device->host, so log GpuToCpu
     (the original logged this transfer as PetscLogCpuToGpu) */
  if (!dmem) PetscCall(PetscLogGpuToCpu(n*sizeof(PetscScalar)));
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A,&av));
  PetscFunctionReturn(0);
}
4698