xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 1179163e0bf5c4dd309079707fd3c0dfe8d44eee)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
7 
8 #include <petscconf.h>
9 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
10 #include <../src/mat/impls/sbaij/seq/sbaij.h>
11 #include <../src/vec/vec/impls/dvecimpl.h>
12 #include <petsc/private/vecimpl.h>
13 #undef VecType
14 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
15 #include <thrust/adjacent_difference.h>
16 #include <thrust/async/for_each.h>
17 #include <thrust/iterator/constant_iterator.h>
18 #include <thrust/remove.h>
19 #include <thrust/sort.h>
20 #include <thrust/unique.h>
21 
22 const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
23 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
24   /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
25     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
26 
27   typedef enum {
28       CUSPARSE_MV_ALG_DEFAULT = 0,
29       CUSPARSE_COOMV_ALG      = 1,
30       CUSPARSE_CSRMV_ALG1     = 2,
31       CUSPARSE_CSRMV_ALG2     = 3
32   } cusparseSpMVAlg_t;
33 
34   typedef enum {
35       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
36       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
37       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
38       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
39       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
40       CUSPARSE_SPMM_ALG_DEFAULT = 0,
41       CUSPARSE_SPMM_COO_ALG1    = 1,
42       CUSPARSE_SPMM_COO_ALG2    = 2,
43       CUSPARSE_SPMM_COO_ALG3    = 3,
44       CUSPARSE_SPMM_COO_ALG4    = 5,
45       CUSPARSE_SPMM_CSR_ALG1    = 4,
46       CUSPARSE_SPMM_CSR_ALG2    = 6,
47   } cusparseSpMMAlg_t;
48 
49   typedef enum {
50       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
51       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
52   } cusparseCsr2CscAlg_t;
53   */
54   const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
55   const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
56   const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
57 #endif
58 
59 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
60 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
61 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
62 
63 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
64 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
65 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
66 
67 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
68 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
69 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
70 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
71 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
72 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
73 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
74 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
75 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
76 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
77 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
78 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
79 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
80 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
81 
82 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
83 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
84 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
85 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
86 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
87 
88 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
89 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);
90 
91 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
92 static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscCount,const PetscInt[],const PetscInt[]);
93 static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
94 
/* Reports which solver package produced a factored matrix of this type; always MATSOLVERCUSPARSE.
   Registered below as the "MatFactorGetSolverType_C" composed function on factor matrices. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
101 
102 /*MC
103   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
104   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
105   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
106   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
107   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
108   algorithms are not recommended. This class does NOT support direct solver operations.
109 
110   Level: beginner
111 
112 .seealso: `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
113 M*/
114 
/*
  Creates an (empty) factor matrix B of type MATSEQAIJCUSPARSE for A and wires up the
  symbolic-factorization function pointers for the requested factor type.

  Input Parameters:
.  A     - the matrix to be factored (type seqaijcusparse)
.  ftype - one of MAT_FACTOR_LU/ILU/ILUDT/CHOLESKY/ICC; anything else is an error

  Output Parameter:
.  B     - the new factor matrix

  Notes: when A is bound to the CPU the plain SeqAIJ symbolic routines are installed
  instead of the CUSPARSE ones, and CPU binding is propagated to B when requested.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscInt n = A->rmap->n;

  PetscFunctionBegin;
  /* factor matrices are square with the same (local) row size as A */
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B));
  PetscCall(MatSetSizes(*B,n,n,n,n));
  (*B)->factortype = ftype;
  PetscCall(MatSetType(*B,MATSEQAIJCUSPARSE));

  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B,PETSC_TRUE));
  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU:
  case MAT_FACTOR_ILUDT:
    PetscCall(MatSetBlockSizesFromMats(*B,A,A));
    if (A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    } else {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC:
    if (A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]));
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  }

  PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL));
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(0);
}
155 
/* Type-specific implementation of MatCUSPARSESetFormat() for SEQAIJCUSPARSE.
   Both supported operations store into the same format field, so the cases share one body. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT: /* fall through: a sequential matrix has a single storage format */
  case MAT_CUSPARSE_ALL:
    cusp->format = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
173 
/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.

   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Level: intermediate

.seealso: `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation if one is composed on A; silently a no-op otherwise */
  PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));
  PetscFunctionReturn(0);
}
198 
/* Type-specific implementation of MatCUSPARSESetUseCPUSolve() for SEQAIJCUSPARSE.
   Records the preference; it is honored when MatLUFactorNumeric_SeqAIJCUSPARSE selects solve routines. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A,PetscBool use_cpu)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  cusp->use_cpu_solve = use_cpu;
  PetscFunctionReturn(0);
}
207 
/*@
   MatCUSPARSESetUseCPUSolve - Sets use CPU MatSolve.

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
-  use_cpu - set flag for using the built-in CPU MatSolve

   Notes:
   The cuSparse LU solver currently computes the factors with the built-in CPU method
   and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
   This method is used to specify if the solve is done on the CPU or GPU (GPU is the default).

   Level: intermediate

.seealso: `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A,PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation if one is composed on A; silently a no-op otherwise */
  PetscTryMethod(A,"MatCUSPARSESetUseCPUSolve_C",(Mat,PetscBool),(A,use_cpu));
  PetscFunctionReturn(0);
}
233 
/* MatSetOption implementation for SEQAIJCUSPARSE: intercepts MAT_FORM_EXPLICIT_TRANSPOSE and
   delegates every other option to the SeqAIJ implementation. */
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* when the option is being turned off, destroy any cached transpose now so that a stale
       copy cannot be picked up if the option is re-enabled later */
    if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
    A->form_explicit_transpose = flg;
  } else {
    PetscCall(MatSetOption_SeqAIJ(A,op,flg));
  }
  PetscFunctionReturn(0);
}
249 
250 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
251 
/* Numeric LU factorization: the factorization itself is done on the CPU by the SeqAIJ kernel;
   afterwards the appropriate (GPU or natural-ordering GPU) MatSolve variants are installed and,
   unless the user requested CPU solves, the triangular factors are analyzed/copied to the GPU. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ         *b    = (Mat_SeqAIJ*)B->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  PetscBool           row_identity,col_identity;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatLUFactorNumeric_SeqAIJ(B,A,info));
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used: natural ordering allows the faster kernels */
  PetscCall(ISIdentity(b->row,&row_identity));
  PetscCall(ISIdentity(b->col,&col_identity));
  if (!cusp->use_cpu_solve) {
    if (row_identity && col_identity) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
  }
  /* no CUSPARSE MatMatSolve available; fall back to the default dispatch */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors onto the GPU */
  if (!cusp->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(0);
}
288 
/* Processes the -mat_cusparse_* runtime options controlling storage format, CPU-vs-GPU solve,
   and (CUDA >= 11) the cuSPARSE SpMV/SpMM/CSR2CSC algorithm choices for an unfactored matrix. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject,"SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) { /* factored matrices have no user-settable GPU options here */
    /* storage format used for SpMV only */
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                               "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format));

    /* storage format used for all operations; parsed second so it overrides the SpMV-only option */
    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                               "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format));
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve","Use CPU (I)LU solve","MatCUSPARSESetUseCPUSolve",cusparsestruct->use_cpu_solve,&cusparsestruct->use_cpu_solve,&flg));
    if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A,cusparsestruct->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                               "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg));
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
#if CUSPARSE_VERSION > 11301
    PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#else
    PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                               "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg));
    PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                               "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg));
    PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(0);
}
328 
/* Symbolic ILU: reuse the host SeqAIJ symbolic phase, then route the numeric phase
   through the CUSPARSE-aware routine. Any stale GPU factor data is dropped first. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&factors));
  PetscCall(MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
339 
/* Symbolic LU: reuse the host SeqAIJ symbolic phase, then route the numeric phase
   through the CUSPARSE-aware routine. Any stale GPU factor data is dropped first. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&factors));
  PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
350 
/* Symbolic ICC: reuse the host SeqAIJ symbolic phase, then route the numeric phase
   through the CUSPARSE-aware Cholesky routine. Any stale GPU factor data is dropped first. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&factors));
  PetscCall(MatICCFactorSymbolic_SeqAIJ(B,A,perm,info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
361 
/* Symbolic Cholesky: reuse the host SeqAIJ symbolic phase, then route the numeric phase
   through the CUSPARSE-aware routine. Any stale GPU factor data is dropped first. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&factors));
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
372 
/* Builds (or refreshes) the GPU CSR copy of the unit lower-triangular factor L from the CPU
   ILU factorization stored in A (a SeqAIJ factor matrix, strictly-lower entries per row followed
   by the upper part; the unit diagonal is implicit and is inserted here explicitly).
   On first call it also creates the cuSPARSE matrix descriptor and runs the triangular-solve
   analysis; on subsequent calls only the numerical values are re-uploaded. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* only rebuild when the authoritative data lives on the CPU (or nothing is on the GPU yet) */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try { /* thrust/cuSPARSE helpers below may throw; converted to a PETSc error in the catch */
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* n unit-diagonal entries plus the strictly-lower entries of rows 1..n-1 (row 0 has none) */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) { /* first time: build structure + values and run the solve analysis */
        PetscScalar                       *AALo;

        /* pinned host buffers so the thrust assign()s below copy efficiently */
        PetscCallCUDA(cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallCUDA(cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt)));

        /* Fill the lower triangular matrix: row 0 is just the unit diagonal entry */
        AiLo[0]  = (PetscInt) 0;
        AiLo[n]  = nzLower;
        AjLo[0]  = (PetscInt) 0;
        AALo[0]  = (MatScalar) 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        rowOffset= 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          /* copy the strictly-lower entries of row i, then append the unit diagonal */
          PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz));
          PetscCall(PetscArraycpy(&(AALo[offset]), v, nz));

          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
       #else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
       #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix: upload the packed CSR arrays to device-side thrust vectors */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        PetscCallCUSPARSE(cusparse_create_analysis_info(&loTriFactor->solveInfo));
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* csrsv2-style API needs an explicit work buffer for the analysis/solve */
        PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                               &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize));
      #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                         loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                         loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                         loTriFactor->csrMat->column_indices->data().get(),
                                         #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                         loTriFactor->solveInfo,
                                         loTriFactor->solvePolicy, loTriFactor->solveBuffer));
                                         #else
                                         loTriFactor->solveInfo));
                                         #endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer; the value buffer AALo is kept (as AA_h) for fast numeric updates */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h = AALo;
        PetscCallCUDA(cudaFreeHost(AiLo));
        PetscCallCUDA(cudaFreeHost(AjLo));
        PetscCall(PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar)));
      } else { /* update values only; sparsity pattern is unchanged from the first build */
        if (!loTriFactor->AA_h) {
          PetscCallCUDA(cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar)));
        }
        /* Fill the lower triangular matrix (same packing as above, values only) */
        loTriFactor->AA_h[0]  = 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz));
          offset      += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset      += 1;
          v  += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower*sizeof(PetscScalar)));
      }
    } catch(char *ex) {
      /* NOTE(review): catches char* thrown by the CUSPARSE/thrust helpers used here; other
         exception types would propagate — confirm against the project's error conventions */
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
516 
517 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
518 {
519   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
520   PetscInt                          n = A->rmap->n;
521   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
522   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
523   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
524   const MatScalar                   *aa = a->a,*v;
525   PetscInt                          *AiUp, *AjUp;
526   PetscInt                          i,nz, nzUpper, offset;
527 
528   PetscFunctionBegin;
529   if (!n) PetscFunctionReturn(0);
530   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
531     try {
532       /* next, figure out the number of nonzeros in the upper triangular matrix. */
533       nzUpper = adiag[0]-adiag[n];
534       if (!upTriFactor) {
535         PetscScalar *AAUp;
536 
537         PetscCallCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar)));
538 
539         /* Allocate Space for the upper triangular matrix */
540         PetscCallCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt)));
541         PetscCallCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt)));
542 
543         /* Fill the upper triangular matrix */
544         AiUp[0]=(PetscInt) 0;
545         AiUp[n]=nzUpper;
546         offset = nzUpper;
547         for (i=n-1; i>=0; i--) {
548           v  = aa + adiag[i+1] + 1;
549           vi = aj + adiag[i+1] + 1;
550 
551           /* number of elements NOT on the diagonal */
552           nz = adiag[i] - adiag[i+1]-1;
553 
554           /* decrement the offset */
555           offset -= (nz+1);
556 
557           /* first, set the diagonal elements */
558           AjUp[offset] = (PetscInt) i;
559           AAUp[offset] = (MatScalar)1./v[nz];
560           AiUp[i]      = AiUp[i+1] - (nz+1);
561 
562           PetscCall(PetscArraycpy(&(AjUp[offset+1]), vi, nz));
563           PetscCall(PetscArraycpy(&(AAUp[offset+1]), v, nz));
564         }
565 
566         /* allocate space for the triangular factor information */
567         PetscCall(PetscNew(&upTriFactor));
568         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
569 
570         /* Create the matrix description */
571         PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
572         PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
573        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
574         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
575        #else
576         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
577        #endif
578         PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
579         PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
580 
581         /* set the operation */
582         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
583 
584         /* set the matrix */
585         upTriFactor->csrMat = new CsrMatrix;
586         upTriFactor->csrMat->num_rows = n;
587         upTriFactor->csrMat->num_cols = n;
588         upTriFactor->csrMat->num_entries = nzUpper;
589 
590         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
591         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
592 
593         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
594         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
595 
596         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
597         upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
598 
599         /* Create the solve analysis information */
600         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
601         PetscCallCUSPARSE(cusparse_create_analysis_info(&upTriFactor->solveInfo));
602       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
603         PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
604                                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
605                                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
606                                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
607                                                &upTriFactor->solveBufferSize));
608         PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize));
609       #endif
610 
611         /* perform the solve analysis */
612         PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
613                                          upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
614                                          upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
615                                          upTriFactor->csrMat->column_indices->data().get(),
616                                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
617                                          upTriFactor->solveInfo,
618                                          upTriFactor->solvePolicy, upTriFactor->solveBuffer));
619                                          #else
620                                          upTriFactor->solveInfo));
621                                          #endif
622         PetscCallCUDA(WaitForCUDA());
623         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));
624 
625         /* assign the pointer */
626         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
627         upTriFactor->AA_h = AAUp;
628         PetscCallCUDA(cudaFreeHost(AiUp));
629         PetscCallCUDA(cudaFreeHost(AjUp));
630         PetscCall(PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar)));
631       } else {
632         if (!upTriFactor->AA_h) {
633           PetscCallCUDA(cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar)));
634         }
635         /* Fill the upper triangular matrix */
636         offset = nzUpper;
637         for (i=n-1; i>=0; i--) {
638           v  = aa + adiag[i+1] + 1;
639 
640           /* number of elements NOT on the diagonal */
641           nz = adiag[i] - adiag[i+1]-1;
642 
643           /* decrement the offset */
644           offset -= (nz+1);
645 
646           /* first, set the diagonal elements */
647           upTriFactor->AA_h[offset] = 1./v[nz];
648           PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz));
649         }
650         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
651         PetscCall(PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar)));
652       }
653     } catch(char *ex) {
654       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
655     }
656   }
657   PetscFunctionReturn(0);
658 }
659 
/*
   Builds both ILU triangular factors on the GPU and, when the row/column
   orderings are not the identity, caches the permutation indices in device
   arrays for use by the permuted triangular solves.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           rowIS    = aij->row;
  IS                           colIS    = aij->icol;
  PetscInt                     n        = A->rmap->n;
  PetscBool                    rowIsIdentity,colIsIdentity;

  PetscFunctionBegin;
  PetscCheck(factors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));

  if (!factors->workVector) factors->workVector = new THRUSTARRAY(n);
  factors->nnz = aij->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular (row) permutation indices */
  PetscCall(ISIdentity(rowIS,&rowIsIdentity));
  if (!rowIsIdentity && !factors->rpermIndices) {
    const PetscInt *idx;

    PetscCall(ISGetIndices(rowIS,&idx));
    factors->rpermIndices = new THRUSTINTARRAY(n);
    factors->rpermIndices->assign(idx, idx+n);
    PetscCall(ISRestoreIndices(rowIS,&idx));
    PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt)));
  }

  /* upper triangular (column) permutation indices */
  PetscCall(ISIdentity(colIS,&colIsIdentity));
  if (!colIsIdentity && !factors->cpermIndices) {
    const PetscInt *idx;

    PetscCall(ISGetIndices(colIS,&idx));
    factors->cpermIndices = new THRUSTINTARRAY(n);
    factors->cpermIndices->assign(idx, idx+n);
    PetscCall(ISRestoreIndices(colIS,&idx));
    PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}
702 
/*
   Builds, on the host, the CSR arrays of the (incomplete) Cholesky upper
   triangular factor with inverted diagonal (AAUp) plus a diagonally-scaled
   copy (AALo), uploads them to the GPU, and runs the cusparse solve analysis
   for both triangular solves.  On subsequent calls (factors already built)
   only the numerical values are refreshed and re-uploaded.

   Note: only ONE CSR structure (AiUp/AjUp) is stored; the "lower" factor
   reuses the upper factor's sparsity pattern and is solved with
   CUSPARSE_OPERATION_TRANSPOSE (see loTriFactor->solveOp below).
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* A->data is reinterpreted as SeqSBAIJ here to read the i/j/a arrays of the
     symmetric (upper-stored) factor produced by the Cholesky factorization */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers for the two value arrays */
      PetscCallCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar)));
      PetscCallCUDA(cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar)));
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt)));

        /* Fill the upper triangular matrix.  The SBAIJ factor stores the
           diagonal entry LAST in each row (at v[nz]); the CSR built here puts
           the diagonal FIRST, with value 1/d, followed by the negated
           off-diagonals.  AALo is AAUp additionally scaled by 1/d per row. */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz));
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
       #else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
       #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        /* diagonal is UNIT here: the 1/d values stored at the diagonal
           positions are applied via the transposed (scaled) solve instead */
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        PetscCallCUSPARSE(cusparse_create_analysis_info(&upTriFactor->solveInfo));
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                               &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize));
      #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                         upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                         upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                         upTriFactor->csrMat->column_indices->data().get(),
                                         #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                         upTriFactor->solveInfo,
                                         upTriFactor->solvePolicy, upTriFactor->solveBuffer));
                                         #else
                                         upTriFactor->solveInfo));
                                         #endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
       #else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
       #endif
        /* fill mode stays UPPER: the stored matrix IS upper triangular; the
           lower triangular solve is obtained through the TRANSPOSE operation
           set just below */
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix: same sparsity pattern (AiUp/AjUp) as the upper
           factor, but with the diagonally-scaled values AALo */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        PetscCallCUSPARSE(cusparse_create_analysis_info(&loTriFactor->solveInfo));
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                               &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize));
      #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                         loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                         loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                         loTriFactor->csrMat->column_indices->data().get(),
                                         #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                         loTriFactor->solveInfo,
                                         loTriFactor->solvePolicy, loTriFactor->solveBuffer));
                                         #else
                                         loTriFactor->solveInfo));
                                         #endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        PetscCall(PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar))));
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
      } else {
        /* Factors already exist: only refresh the numerical values (same
           pattern as the structural branch above, minus AiUp/AjUp) */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        PetscCheck(upTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        PetscCheck(loTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        PetscCall(PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar)));
      }
      PetscCallCUDA(cudaFreeHost(AAUp));
      PetscCallCUDA(cudaFreeHost(AALo));
    } catch(char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
919 
/*
   Builds the ICC triangular factor data on the GPU and, when the ordering is
   not the identity, uploads the permutation and its inverse to device arrays
   (rpermIndices/cpermIndices) for use by the permuted triangular solves.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           perm     = aij->row;
  PetscInt                     n        = A->rmap->n;
  PetscBool                    isIdentity;

  PetscFunctionBegin;
  PetscCheck(factors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  if (!factors->workVector) factors->workVector = new THRUSTARRAY(n);
  /* both triangular factors share one stored pattern: count off-diagonals twice */
  factors->nnz = (aij->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices */
  PetscCall(ISIdentity(perm,&isIdentity));
  if (!isIdentity) {
    IS             invPerm;
    const PetscInt *inv,*fwd;

    PetscCall(ISInvertPermutation(perm,PETSC_DECIDE,&invPerm));
    PetscCall(ISGetIndices(invPerm,&inv));
    PetscCall(ISGetIndices(perm,&fwd));
    factors->rpermIndices = new THRUSTINTARRAY(n);
    factors->rpermIndices->assign(fwd, fwd+n);
    factors->cpermIndices = new THRUSTINTARRAY(n);
    factors->cpermIndices->assign(inv, inv+n);
    PetscCall(ISRestoreIndices(invPerm,&inv));
    PetscCall(ISDestroy(&invPerm));
    PetscCall(ISRestoreIndices(perm,&fwd));
    PetscCall(PetscLogCpuToGpu(2.*n*sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}
956 
/*
   Numeric Cholesky factorization for SeqAIJCUSPARSE: performs the CPU
   factorization, selects the (natural-ordering or permuted) GPU solve
   routines based on the row permutation, and pushes the factors to the GPU.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ *bb      = (Mat_SeqAIJ*)B->data;
  IS         rowperm  = bb->row;
  PetscBool  natural;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B,A,info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* determine which version of MatSolve needs to be used. */
  PetscCall(ISIdentity(rowperm,&natural));
  B->ops->solve             = natural ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering : MatSolve_SeqAIJCUSPARSE;
  B->ops->solvetranspose    = natural ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering : MatSolveTranspose_SeqAIJCUSPARSE;
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(0);
}
985 
/*
   Builds the transposes (CSC form, via csr2csc) of the lower and upper
   triangular factors stored in A->spptr and runs the cusparse solve analysis
   on each, caching the results in loTriFactorPtrTranspose and
   upTriFactorPtrTranspose so transpose solves need no on-the-fly transpose.

   Fix: the MAT_CUSPARSEGenerateTranspose log event was "closed" with a second
   PetscLogEventBegin instead of PetscLogEventEnd (in both the lower- and
   upper-factor sections), leaving the event open and corrupting -log_view
   timings; both are now PetscLogEventEnd.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  PetscCall(PetscNew(&loTriFactorT));
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor; transposing
     flips the fill mode while type, index base and diagonal type carry over */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));

  /* set the operation: the transpose is stored explicitly, so solve non-transposed */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                               loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                               loTriFactor->csrMat->values->data().get(),
                                               loTriFactor->csrMat->row_offsets->data().get(),
                                               loTriFactor->csrMat->column_indices->data().get(),
                                               loTriFactorT->csrMat->values->data().get(),
                                               loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                               CUSPARSE_ACTION_NUMERIC,indexBase,
                                               CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize));
#endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
  PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                  loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                  loTriFactor->csrMat->values->data().get(),
                                  loTriFactor->csrMat->row_offsets->data().get(),
                                  loTriFactor->csrMat->column_indices->data().get(),
                                  loTriFactorT->csrMat->values->data().get(),
                                  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                                  loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                  CUSPARSE_ACTION_NUMERIC, indexBase,
                                  CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer));
                                  #else
                                  loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                  CUSPARSE_ACTION_NUMERIC, indexBase));
                                  #endif
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0));

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
  PetscCallCUSPARSE(cusparse_create_analysis_info(&loTriFactorT->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                         loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                         loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                         loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                         &loTriFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize));
#endif

  /* perform the solve analysis */
  PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                   loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                   loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                   loTriFactorT->csrMat->column_indices->data().get(),
                                   #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                   loTriFactorT->solveInfo,
                                   loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
                                   #else
                                   loTriFactorT->solveInfo));
                                   #endif
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  PetscCall(PetscNew(&upTriFactorT));
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                               upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                               upTriFactor->csrMat->values->data().get(),
                                               upTriFactor->csrMat->row_offsets->data().get(),
                                               upTriFactor->csrMat->column_indices->data().get(),
                                               upTriFactorT->csrMat->values->data().get(),
                                               upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                               CUSPARSE_ACTION_NUMERIC,indexBase,
                                               CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize));
#endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
  PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                                  upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                  upTriFactor->csrMat->values->data().get(),
                                  upTriFactor->csrMat->row_offsets->data().get(),
                                  upTriFactor->csrMat->column_indices->data().get(),
                                  upTriFactorT->csrMat->values->data().get(),
                                  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                                  upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                  CUSPARSE_ACTION_NUMERIC, indexBase,
                                  CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer));
                                  #else
                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                  CUSPARSE_ACTION_NUMERIC, indexBase));
                                  #endif

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0));

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
  PetscCallCUSPARSE(cusparse_create_analysis_info(&upTriFactorT->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                         upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                         upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                         upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                         &upTriFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize));
  #endif

  /* perform the solve analysis (same sequence as for the lower factor above) */
  PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                   upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                   upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                   upTriFactorT->csrMat->column_indices->data().get(),
                                   #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                   upTriFactorT->solveInfo,
                                   upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
                                   #else
                                   upTriFactorT->solveInfo));
                                   #endif

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
1189 
/* Unary functor mapping a PetscScalar to a PetscInt by truncating its real part.
   Used with thrust::transform below to recover integer permutation indices from
   the values produced by a csr2csc conversion of a 0,1,2,... sequence. */
struct PetscScalarToPetscInt
{
  __host__ __device__
  PetscInt operator()(PetscScalar val)
  {
    return static_cast<PetscInt>(PetscRealPart(val));
  }
};
1198 
/*
   MatSeqAIJCUSPARSEFormExplicitTranspose - builds (or refreshes) an explicit device-side
   transpose of A and stores it in ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose.

   A no-op when A->transupdated is already set. For CSR format the structure is created
   once and only the values are permuted on subsequent calls, using a cached permutation
   (csr2csc_i) computed by running csr2csc on a 0,1,2,... sequence. For ELL/HYB (CUDA < 11
   only) the matrix is converted HYB -> CSR -> CSC -> HYB each time.
*/
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  PetscCheck(matstruct,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  PetscCheck(!A->transupdated || matstructT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(0);
  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
  PetscCall(PetscLogGpuTimeBegin());
  /* non-CSR formats cannot reuse the cached transpose structure, so drop it */
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
  }
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
    PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

    /* set alpha and beta */
    PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      CsrMatrix *matrixT = new CsrMatrix;
      matstructT->mat = matrixT;
      /* transpose dimensions: rows/cols of A swapped */
      matrixT->num_rows = A->cmap->n;
      matrixT->num_cols = A->rmap->n;
      matrixT->num_entries = a->nz;
      matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values = new THRUSTARRAY(a->nz);

      /* upload A's row offsets (a->i) so the csr2csc below can read them on the device */
      if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      #if PETSC_PKG_CUDA_VERSION_GE(11,2,1)
        stat = cusparseCreateCsr(&matstructT->matDescr,
                               matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                               matrixT->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase,cusparse_scalartype);PetscCallCUSPARSE(stat);
      #else
        /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
           see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

           I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
           it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
           when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
        */
        if (matrixT->num_entries) {
          stat = cusparseCreateCsr(&matstructT->matDescr,
                                 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                                 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                                 matrixT->values->data().get(),
                                 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I,
                                 indexBase,cusparse_scalartype);PetscCallCUSPARSE(stat);

        } else {
          matstructT->matDescr = NULL;
          matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
        }
      #endif
     #endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
   #else
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows = A->rmap->n;
      temp->num_cols = A->cmap->n;
      temp->num_entries = a->nz;
      temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle,
                              matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get());PetscCallCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      tempT->num_rows = A->rmap->n;
      tempT->num_cols = A->cmap->n;
      tempT->num_entries = a->nz;
      tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                              temp->num_cols, temp->num_entries,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get(),
                              tempT->values->data().get(),
                              tempT->column_indices->data().get(),
                              tempT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);PetscCallCUSPARSE(stat);

      /* Last, convert CSC to HYB */
      cusparseHybMat_t hybMat;
      PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
      cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
        CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                              matstructT->descr, tempT->values->data().get(),
                              tempT->row_offsets->data().get(),
                              tempT->column_indices->data().get(),
                              hybMat, 0, partition);PetscCallCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY*) tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
        delete (CsrMatrix*) tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY*) temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
        delete (CsrMatrix*) temp;
      }
     #endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
    PetscCheck(matrix,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix");
    PetscCheck(matrix->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows");
    PetscCheck(matrix->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols");
    PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values");
    PetscCheck(matrixT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT");
    PetscCheck(matrixT->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows");
    PetscCheck(matrixT->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols");
    PetscCheck(matrixT->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
    }
    if (!cusparsestruct->csr2csc_i) {
      /* Build the nonzero permutation csr2csc_i once: run csr2csc on a 0,1,2,... sequence
         stored in PetscScalars; the permuted output (in matrixT->values) encodes where each
         nonzero of A lands in the transpose. Later calls reuse it via the thrust::copy below. */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      void   *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                           A->cmap->n, matrix->num_entries,
                                           matrix->values->data().get(),
                                           cusparsestruct->rowoffsets_gpu->data().get(),
                                           matrix->column_indices->data().get(),
                                           matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                           CUSPARSE_ACTION_NUMERIC,indexBase,
                                           cusparsestruct->csr2cscAlg, &csr2cscBufferSize);PetscCallCUSPARSE(stat);
      PetscCallCUDA(cudaMalloc(&csr2cscBuffer,csr2cscBufferSize));
     #endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                              A->cmap->n,matrix->num_entries,
                              csr2csc_a.data().get(),
                              cusparsestruct->rowoffsets_gpu->data().get(),
                              matrix->column_indices->data().get(),
                              matrixT->values->data().get(),
                             #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                              matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                              CUSPARSE_ACTION_NUMERIC,indexBase,
                              cusparsestruct->csr2cscAlg, csr2cscBuffer);PetscCallCUSPARSE(stat);
                             #else
                              matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);PetscCallCUSPARSE(stat);
                             #endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
      }

      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      PetscCallCUDA(cudaFree(csr2cscBuffer));
     #endif
    }
    /* gather A's values through the cached permutation to refresh the transpose's values */
    PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
                                                     thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
                                                     matrixT->values->begin()));
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
  /* the compressed row indices are not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
1426 
/* TODO(review): why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
   MatSolveTranspose_SeqAIJCUSPARSE - GPU triangular solve with the transposed factors,
   for factored matrices with non-trivial row/column permutations.

   Steps: permute b into x via rpermIndices, solve with the transposed upper factor,
   then the transposed lower factor, and finally apply cpermIndices (through the work
   vector, since the permutation cannot be applied in place).
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly, on first use only */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
  PetscCall(VecCUDAGetArrayRead(bb,&barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder b with the row permutation, writing the result into x */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve with the transposed upper factor (x -> tempGPU) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                        tempGPU->data().get());PetscCallCUSPARSE(stat);
                      #endif

  /* Then, solve with the transposed lower factor (tempGPU -> x) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                         xarray);PetscCallCUSPARSE(stat);
                      #endif

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb,&barray));
  PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1514 
/*
   MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering - GPU triangular solve with the
   transposed factors for the natural-ordering case: identical to
   MatSolveTranspose_SeqAIJCUSPARSE but with no row/column permutations to apply.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly, on first use only */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
  PetscCall(VecCUDAGetArrayRead(bb,&barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve with the transposed upper factor (b -> tempGPU) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                        tempGPU->data().get());PetscCallCUSPARSE(stat);
                      #endif

  /* Then, solve with the transposed lower factor (tempGPU -> x) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                        xarray);PetscCallCUSPARSE(stat);
                      #endif

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb,&barray));
  PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1583 
/*
   MatSolve_SeqAIJCUSPARSE - GPU forward/backward triangular solve for factored matrices
   with non-trivial row/column permutations.

   Steps: permute b via rpermIndices into the work vector, solve L then U with
   cusparse_solve, and apply cpermIndices to produce x.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
  PetscCall(VecCUDAGetArrayRead(bb,&barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder b with the row permutation into the work vector */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L (tempGPU -> x) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                         xarray,
                         loTriFactor->solvePolicy, loTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                         xarray);PetscCallCUSPARSE(stat);
                      #endif

  /* Then, solve U (x -> tempGPU) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,xarray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                        tempGPU->data().get());PetscCallCUSPARSE(stat);
                      #endif

  /* Last, reorder with the column permutation (tempGPU -> x) */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  PetscCall(VecCUDARestoreArrayRead(bb,&barray));
  PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1658 
/*
   MatSolve_SeqAIJCUSPARSE_NaturalOrdering - GPU forward/backward triangular solve for the
   natural-ordering case: identical to MatSolve_SeqAIJCUSPARSE but with no row/column
   permutations to apply.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
  PetscCall(VecCUDAGetArrayRead(bb,&barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve L (b -> tempGPU) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        loTriFactor->solvePolicy,loTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                        tempGPU->data().get());PetscCallCUSPARSE(stat);
                      #endif

  /* Next, solve U (tempGPU -> x) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                        xarray);PetscCallCUSPARSE(stat);
                      #endif

  PetscCall(VecCUDARestoreArrayRead(bb,&barray));
  PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1719 
/* Download the matrix values from the GPU into the host (Mat_SeqAIJ) storage.

   Only the numerical values are copied (a->nz scalars from the device CSR value
   array into a->a); the sparsity pattern is assumed identical on both sides.
   After a successful download the matrix is flagged as valid on host and device. */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *aij  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  if (A->offloadmask != PETSC_OFFLOAD_GPU) PetscFunctionReturn(0); /* host copy already up to date */
  csr = (CsrMatrix*)cusp->mat->mat;
  PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0));
  PetscCallCUDA(cudaMemcpy(aij->a, csr->values->data().get(), aij->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost));
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogGpuToCpu(aij->nz*sizeof(PetscScalar)));
  PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0));
  A->offloadmask = PETSC_OFFLOAD_BOTH;
  PetscFunctionReturn(0);
}
1738 
/* Give the caller read/write access to the host value array of A, first
   downloading the values from the GPU if the device copy is the current one */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  aij    = (Mat_SeqAIJ*)A->data;
  *array = aij->a;
  PetscFunctionReturn(0);
}
1746 
/* Release read/write access obtained with MatSeqAIJGetArray_SeqAIJCUSPARSE.
   The host values may have been modified, so the device copy is marked stale. */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  *array         = NULL;              /* invalidate the caller's pointer */
  A->offloadmask = PETSC_OFFLOAD_CPU; /* host now holds the only valid values */
  PetscFunctionReturn(0);
}
1754 
/* Give the caller read-only access to the host value array of A, first
   downloading the values from the GPU if the device copy is the current one */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  aij    = (Mat_SeqAIJ*)A->data;
  *array = aij->a;
  PetscFunctionReturn(0);
}
1762 
/* Release read-only access obtained with MatSeqAIJGetArrayRead_SeqAIJCUSPARSE.
   Read access cannot change the values, so the offload mask is left untouched. */
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL; /* invalidate the caller's pointer */
  PetscFunctionReturn(0);
}
1769 
/* Give the caller write-only access to the host value array of A.  No
   device-to-host copy is done: the caller will overwrite the contents anyway. */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  *array = aij->a;
  PetscFunctionReturn(0);
}
1776 
/* Release write-only access obtained with MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE.
   The host values were (re)written, so the device copy is marked stale. */
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  *array         = NULL;              /* invalidate the caller's pointer */
  A->offloadmask = PETSC_OFFLOAD_CPU; /* host now holds the only valid values */
  PetscFunctionReturn(0);
}
1784 
/* Return device pointers to the CSR representation of A (row offsets, column
   indices, values) after making sure the GPU copy is current.

   Any of i/j/a/mtype may be NULL if the caller does not need that output.
   Row offsets and column indices are stored as 32-bit ints on the device
   (THRUSTINTARRAY32), hence the hard error for 64-bit-index builds. */
static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A,const PetscInt **i,const PetscInt **j,PetscScalar **a,PetscMemType *mtype)
{
  Mat_SeqAIJCUSPARSE           *cusp;
  CsrMatrix                    *matrix;

  PetscFunctionBegin;
  /* Check BEFORE touching the GPU copy: for factored matrices A->spptr holds a
     Mat_SeqAIJCUSPARSETriFactors structure, so the copy/cast below would be invalid */
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  cusp = static_cast<Mat_SeqAIJCUSPARSE*>(A->spptr);
  PetscCheck(cusp != NULL,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"cusp is NULL");
  matrix = (CsrMatrix*)cusp->mat->mat;

  if (i) {
   #if !defined(PETSC_USE_64BIT_INDICES)
    *i = matrix->row_offsets->data().get();
   #else
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSparse does not support 64-bit indices");
   #endif
  }
  if (j) {
   #if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
   #else
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSparse does not support 64-bit indices");
   #endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(0);
}
1815 
/* Upload the AIJ matrix from host (Mat_SeqAIJ) storage to the GPU.

   Fast path: when the nonzero pattern is unchanged (same nonzerostate) and the
   device format is CSR, only the values array is re-sent and the cached
   transpose is invalidated (pattern kept).  Otherwise the entire device
   representation is destroyed and rebuilt, honoring compressed row storage and
   the requested cuSPARSE format (CSR always; ELL/HYB only pre-CUDA 11). */
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE; /* set to FALSE when only the pattern (no values) is uploaded */

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu,PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      PetscCheck(!a->nz || a->a,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0));
      matrix->values->assign(a->a, a->a+a->nz);
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0));
      /* values changed: the cached transpose values are stale but its pattern is still valid */
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
    } else {
      PetscInt nnz;
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0));
      /* pattern changed (or non-CSR format): throw away the whole device matrix and rebuild */
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        if (a->compressedrow.use) {
          /* only rows with nonzeros are stored; ridx maps them back to global row numbers */
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheck(ii,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; } /* no host values yet: upload pattern only */
        else nnz = a->nz;
        PetscCheck(!nnz || a->j,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        /* device-resident scalar constants used as alpha/beta in mat-vec/mat-mat calls */
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE)); /* alpha/beta are read from device memory */

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                    mat->num_rows, mat->num_cols, mat->num_entries,
                                    mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                    mat->values->data().get(),
                                    CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                    CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);PetscCallCUSPARSE(stat);
          }
         #endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
         #else
          /* stage the data in a temporary device CSR, convert to HYB/ELL, then free the CSR */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
              matstruct->descr, mat->values->data().get(),
              mat->row_offsets->data().get(),
              mat->column_indices->data().get(),
              hybMat, 0, partition);PetscCallCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
         #endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        PetscCall(PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0));
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; /* values were uploaded too, so both copies agree */
  }
  PetscFunctionReturn(0);
}
1969 }
1970 
/* Thrust tuple functor: accumulate the first tuple element into the second,
   i.e. y += x when applied over a zipped (x, y) iterator range */
struct VecCUDAPlusEquals
{
  template <typename T>
  __host__ __device__
  void operator()(T t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};
1980 
/* Thrust tuple functor: copy the first tuple element into the second,
   i.e. y = x when applied over a zipped (x, y) iterator range */
struct VecCUDAEquals
{
  template <typename T>
  __host__ __device__
  void operator()(T t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
1990 
/* Thrust tuple functor: copy the second tuple element into the first
   (assignment in the opposite direction of VecCUDAEquals) */
struct VecCUDAEqualsReverse
{
  template <typename T>
  __host__ __device__
  void operator()(T t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};
2000 
/* Scratch data attached to C->product->data for AIJCUSPARSE mat-mat products.
   Holds intermediate buffers and (CUDA >= 11) cuSPARSE descriptors that are
   reused across numeric calls; released by MatDestroy_MatMatCusparse. */
struct MatMatCusparse {
  PetscBool             cisdense;       /* user's C was MATSEQDENSE (CPU): convert the GPU result back after the numeric phase */
  PetscScalar           *Bt;            /* device buffer for an explicit B^T (pre-CUDA-11 csrmm cannot transpose B) */
  Mat                   X;              /* intermediate dense product for MATPRODUCT_RARt/PtAP */
  PetscBool             reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops;
  CsrMatrix             *Bcsr;          /* alternative CSR form of B when B is in compressed row storage */

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr;    /* sparse descriptor matching Bcsr */
  PetscBool             initialized;   /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr;      /* dense descriptor for B */
  cusparseDnMatDescr_t  matCDescr;      /* dense descriptor for C (or for X in RARt/PtAP) */
  PetscInt              Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  void                  *dBuffer4;      /* NOTE(review): freed in MatDestroy_MatMatCusparse; presumably SpGEMMreuse work buffers — confirm where they are set */
  void                  *dBuffer5;
 #endif
  size_t                mmBufferSize;   /* allocated size of mmBuffer */
  void                  *mmBuffer;      /* SpMM / SpGEMM compute work buffer */
  void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;     /* descriptor driving the cusparseSpGEMM* calls */
#endif
};
2025 
/* Destroy callback installed as C->product->destroy for AIJCUSPARSE products.
   Frees every buffer and cuSPARSE descriptor the symbolic/numeric phases may
   have allocated; NULL members are skipped (cudaFree(NULL) and delete NULL
   are no-ops, descriptors are guarded explicitly). */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mmdata = (MatMatCusparse *)data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(mmdata->Bt));
  delete mmdata->Bcsr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
  if (mmdata->matBDescr)   PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
  if (mmdata->matCDescr)   PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
  if (mmdata->spgemmDesc)  PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  if (mmdata->dBuffer4)  PetscCallCUDA(cudaFree(mmdata->dBuffer4));
  if (mmdata->dBuffer5)  PetscCallCUDA(cudaFree(mmdata->dBuffer5));
 #endif
  if (mmdata->mmBuffer)  PetscCallCUDA(cudaFree(mmdata->mmBuffer));
  if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
 #endif
  PetscCall(MatDestroy(&mmdata->X)); /* intermediate dense matrix used by RARt/PtAP */
  PetscCall(PetscFree(data));
  PetscFunctionReturn(0);
}
2049 
2050 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
2051 
/* Numeric phase of a product involving a sparse SEQAIJCUSPARSE matrix A and a
   dense matrix B, computed on the GPU.

   Supported product types: AB, AtB, ABt, PtAP, RARt.  For PtAP/RARt the
   sparse-times-dense result is first stored in the intermediate dense matrix
   mmdata->X and then combined with B (or B^T) through a dense-dense product to
   form C.  On CUDA >= 11 cusparseSpMM is used (with cached descriptors and a
   lazily sized work buffer); otherwise the legacy csrmm path is taken, with an
   explicit cuBLAS transpose of B when op(B) = B^T is required. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A    = product->A;
  B    = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  cusp   = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* pick the sparse operand (A or its explicit transpose), the cusparse op, and
     the dimensions (m x n) of the sparse-dense product being computed */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      /* use a stored explicit transpose instead of the (slower) transposed op */
      PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  PetscCheck(mat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda));
  if (!biscuda) PetscCall(MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B));
  PetscCall(MatDenseCUDAGetArrayRead(B,&barray));

  PetscCall(MatDenseGetLDA(B,&blda));
  /* PtAP/RARt write the sparse-dense product into the intermediate X, not C */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseCUDAGetArrayWrite(mmdata->X,&carray));
    PetscCall(MatDenseGetLDA(mmdata->X,&clda));
  } else {
    PetscCall(MatDenseCUDAGetArrayWrite(C,&carray));
    PetscCall(MatDenseGetLDA(C,&clda));
  }

  PetscCall(PetscLogGpuTimeBegin());
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL));
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL));
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);PetscCallCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmBufferSize);PetscCallCUSPARSE(stat);
    /* grow (never shrink) the work buffer to the size cusparse asked for */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      PetscCallCUDA(cudaFree(mmdata->mmBuffer));
      PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer,mmBufferSize));
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get()));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray));
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->mmBuffer);PetscCallCUSPARSE(stat);
 #else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* form B^T explicitly in the mmdata->Bt buffer with a cublas geam */
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);PetscCallCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k    = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);PetscCallCUSPARSE(stat);
 #endif
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(n*2.0*csrmat->num_entries));
  PetscCall(MatDenseCUDARestoreArrayRead(B,&barray));
  /* finish RARt/PtAP with a dense-dense product: C = B * X (transposing B for PtAP) */
  if (product->type == MATPRODUCT_RARt) {
    PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE));
  } else if (product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE));
  } else {
    PetscCall(MatDenseCUDARestoreArrayWrite(C,&carray));
  }
  /* undo the temporary conversions done to run on the GPU */
  if (mmdata->cisdense) {
    PetscCall(MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C));
  }
  if (!biscuda) {
    PetscCall(MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B));
  }
  PetscFunctionReturn(0);
}
2222 
/* Symbolic phase for products of a sparse SEQAIJCUSPARSE matrix A with a dense
   matrix B: determines the result dimensions, forces C to the SEQDENSECUDA
   type, allocates the MatMatCusparse scratch data (B^T buffer pre-CUDA 11,
   intermediate X for RARt/PtAP) and installs the numeric callback. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscCheck(cusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  /* result dimensions per product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  PetscCall(MatSetSizes(C,m,n,m,n));
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense));
  PetscCall(MatSetType(C,MATSEQDENSECUDA));

  /* product data */
  PetscCall(PetscNew(&mmdata));
  mmdata->cisdense = cisdense;
 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    PetscCallCUDA(cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar)));
  }
 #endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X));
    PetscCall(MatSetType(mmdata->X,MATSEQDENSECUDA));
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      PetscCall(MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n));
    } else {
      PetscCall(MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n));
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2295 
2296 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2297 {
2298   Mat_Product                  *product = C->product;
2299   Mat                          A,B;
2300   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2301   Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
2302   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2303   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2304   PetscBool                    flg;
2305   cusparseStatus_t             stat;
2306   MatProductType               ptype;
2307   MatMatCusparse               *mmdata;
2308 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2309   cusparseSpMatDescr_t         BmatSpDescr;
2310 #endif
2311   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2312 
2313   PetscFunctionBegin;
2314   MatCheckProduct(C,1);
2315   PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
2316   PetscCall(PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg));
2317   PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
2318   mmdata = (MatMatCusparse*)C->product->data;
2319   A = product->A;
2320   B = product->B;
2321   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
2322     mmdata->reusesym = PETSC_FALSE;
2323     Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2324     PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2325     Cmat = Ccusp->mat;
2326     PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
2327     Ccsr = (CsrMatrix*)Cmat->mat;
2328     PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
2329     goto finalize;
2330   }
2331   if (!c->nz) goto finalize;
2332   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
2333   PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2334   PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg));
2335   PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
2336   PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2337   PetscCheck(!B->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2338   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2339   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2340   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2341   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2342   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2343   PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2344   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2345   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
2346 
2347   ptype = product->type;
2348   if (A->symmetric && ptype == MATPRODUCT_AtB) {
2349     ptype = MATPRODUCT_AB;
2350     PetscCheck(product->symbolic_used_the_fact_A_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric");
2351   }
2352   if (B->symmetric && ptype == MATPRODUCT_ABt) {
2353     ptype = MATPRODUCT_AB;
2354     PetscCheck(product->symbolic_used_the_fact_B_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric");
2355   }
2356   switch (ptype) {
2357   case MATPRODUCT_AB:
2358     Amat = Acusp->mat;
2359     Bmat = Bcusp->mat;
2360     break;
2361   case MATPRODUCT_AtB:
2362     Amat = Acusp->matTranspose;
2363     Bmat = Bcusp->mat;
2364     break;
2365   case MATPRODUCT_ABt:
2366     Amat = Acusp->mat;
2367     Bmat = Bcusp->matTranspose;
2368     break;
2369   default:
2370     SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2371   }
2372   Cmat = Ccusp->mat;
2373   PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2374   PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2375   PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
2376   Acsr = (CsrMatrix*)Amat->mat;
2377   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
2378   Ccsr = (CsrMatrix*)Cmat->mat;
2379   PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
2380   PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
2381   PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
2382   PetscCall(PetscLogGpuTimeBegin());
2383 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2384   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
2385   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
2386   #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2387     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
2388                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2389                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2390                                mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
2391   #else
2392     stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
2393                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2394                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2395                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);PetscCallCUSPARSE(stat);
2396     stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
2397                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2398                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
2399   #endif
2400 #else
2401   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
2402                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2403                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2404                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2405                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());PetscCallCUSPARSE(stat);
2406 #endif
2407   PetscCall(PetscLogGpuFlops(mmdata->flops));
2408   PetscCallCUDA(WaitForCUDA());
2409   PetscCall(PetscLogGpuTimeEnd());
2410   C->offloadmask = PETSC_OFFLOAD_GPU;
2411 finalize:
2412   /* shorter version of MatAssemblyEnd_SeqAIJ */
2413   PetscCall(PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz));
2414   PetscCall(PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n"));
2415   PetscCall(PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax));
2416   c->reallocs         = 0;
2417   C->info.mallocs    += 0;
2418   C->info.nz_unneeded = 0;
2419   C->assembled = C->was_assembled = PETSC_TRUE;
2420   C->num_ass++;
2421   PetscFunctionReturn(0);
2422 }
2423 
/* Symbolic phase of C = A*B (also A^t*B and A*B^t) for two MATSEQAIJCUSPARSE matrices.

   Determines the sparsity pattern of C on the GPU with cuSPARSE spgemm, allocates the
   device CSR storage for C, and mirrors the row offsets/column indices back to the host
   AIJ structure so that C is a fully valid (but not yet assembled) SeqAIJ matrix.
   All per-product scratch (buffers, descriptors) is stashed in a MatMatCusparse object
   attached to C->product so the numeric phase can reuse it.

   Three code paths, chosen at compile time:
     - CUDA >= 11.4: cusparseSpGEMMreuse_* (preferred; pattern reusable across numerics)
     - CUDA >= 11.0: cusparseSpGEMM_* (two work buffers, see comments below)
     - CUDA  < 11.0: legacy cusparseXcsrgemmNnz + csrgemm (no symbolic-only mode) */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *a,*b,*c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     i,j,m,n,k;
  PetscBool                    flg;
  cusparseStatus_t             stat;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble               flops;
  PetscBool                    biscompressed,ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
  cusparseSpMatDescr_t         BmatSpDescr;
#else
  int                          cnz;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ*)A->data;
  b = (Mat_SeqAIJ*)B->data;
  /* product data: holds buffers/descriptors shared with the numeric phase, freed by the destroy callback */
  PetscCall(PetscNew(&mmdata));
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");

  /* If an operand is symmetric, the (Hermitian-free) transpose product reduces to plain AB;
     record that fact in the product so the numeric phase makes the same choice */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
  }
  if (B->symmetric && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
  }
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  /* Pick result dimensions (m x n, inner dim k), the operand mult structs (explicit
     transposes are formed on demand), and whether B/C use compressed row storage */
  switch (ptype) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    k = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  PetscCall(MatSetSizes(C,m,n,m,n));
  PetscCall(MatSetType(C,MATSEQAIJCUSPARSE));
  c     = (Mat_SeqAIJ*)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
  Ccsr  = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, then c will be in compressed row format */
    c->compressedrow.nrows = a->compressedrow.nrows;
    PetscCall(PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex));
    PetscCall(PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows));
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
  Ccusp->mat      = Cmat;
  Ccusp->mat->mat = Ccsr;
  Ccsr->num_rows    = Ccusp->nrows;
  Ccsr->num_cols    = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
  PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  /* scalar constants live on the device so spgemm can run in device pointer mode */
  PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar)));
  PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar)));
  PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
  PetscCallCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
    thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
    c->nz = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix*)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix; build a shallow CsrMatrix view of B
              that shares column indices and values but uses the uncompressed row offsets */
    CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
    Bcsr = new CsrMatrix;
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices;
    Bcsr->values         = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt)));
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr = Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
                               Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                               Bcsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  /* precompute flops count (2 flops per scalar multiply-add); used by the GPU flop logging below */
  if (ptype == MATPRODUCT_AB) {
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i+1];
      for (j=st; j<en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2.*(b->i[brow+1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt anzi = a->i[i+1] - a->i[i];
      const PetscInt bnzi = b->i[i+1] - b->i[i];
      flops += (2.*anzi)*bnzi;
    }
  } else { /* TODO */
    flops = 0.;
  }

  mmdata->flops = flops;
  PetscCall(PetscLogGpuTimeBegin());

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  /* C's descriptor starts with zero nonzeros; the real pointers are attached once nnz is known */
  stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
                          NULL, NULL, NULL,
                          CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                          CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
  PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
 {
  /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
     We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
  */
  void*  dBuffer1 = NULL;
  void*  dBuffer2 = NULL;
  void*  dBuffer3 = NULL;
  /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
  size_t bufferSize1 = 0;
  size_t bufferSize2 = 0;
  size_t bufferSize3 = 0;
  size_t bufferSize4 = 0;
  size_t bufferSize5 = 0;

  /*----------------------------------------------------------------------*/
  /* ask bufferSize1 bytes for external memory */
  stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                            CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                            &bufferSize1, NULL);PetscCallCUSPARSE(stat);
  PetscCallCUDA(cudaMalloc((void**) &dBuffer1, bufferSize1));
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                            CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                            &bufferSize1, dBuffer1);PetscCallCUSPARSE(stat);

  /*----------------------------------------------------------------------*/
  stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                 &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);PetscCallCUSPARSE(stat);
  PetscCallCUDA(cudaMalloc((void**) &dBuffer2, bufferSize2));
  PetscCallCUDA(cudaMalloc((void**) &dBuffer3, bufferSize3));
  PetscCallCUDA(cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4));
  stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                 &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);PetscCallCUSPARSE(stat);
  PetscCallCUDA(cudaFree(dBuffer1));
  PetscCallCUDA(cudaFree(dBuffer2));

  /*----------------------------------------------------------------------*/
  /* get matrix C non-zero entries C_nnz1 */
  PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
  c->nz = (PetscInt) C_nnz1;
  /* allocate matrix C */
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values         = new THRUSTARRAY(c->nz);PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  /* update matC with the new pointers */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());PetscCallCUSPARSE(stat);

  /*----------------------------------------------------------------------*/
  stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                  &bufferSize5, NULL);PetscCallCUSPARSE(stat);
  PetscCallCUDA(cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5));
  stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                  &bufferSize5, mmdata->dBuffer5);PetscCallCUSPARSE(stat);
  PetscCallCUDA(cudaFree(dBuffer3));
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                                     Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                     cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                     mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
  PetscCall(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024));
 }
 #else
  size_t bufSize2;
  /* ask bufferSize bytes for external memory */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, NULL);PetscCallCUSPARSE(stat);
  PetscCallCUDA(cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2));
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);PetscCallCUSPARSE(stat);
  /* ask bufferSize again bytes for external memory */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);PetscCallCUSPARSE(stat);
  /* The CUSPARSE documentation is not clear, nor the API
     We need both buffers to perform the operations properly!
     mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
     it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
     is stored in the descriptor! What a messy API... */
  PetscCallCUDA(cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize));
  /* compute the intermediate product of A * B */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);PetscCallCUSPARSE(stat);
  /* get matrix C non-zero entries C_nnz1 */
  PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
  c->nz = (PetscInt) C_nnz1;
  PetscCall(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024));
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
 #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
#else
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
  stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);PetscCallCUSPARSE(stat);
  c->nz = cnz;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
     I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
     D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops));
  PetscCall(PetscLogGpuTimeEnd());
finalizesym:
  /* mirror the device CSR pattern into the host Mat_SeqAIJ so C is a valid AIJ matrix */
  c->singlemalloc = PETSC_FALSE;
  c->free_a       = PETSC_TRUE;
  c->free_ij      = PETSC_TRUE;
  PetscCall(PetscMalloc1(m+1,&c->i));
  PetscCall(PetscMalloc1(c->nz,&c->j));
  if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
    PetscInt *d_i = c->i;
    THRUSTINTARRAY ii(Ccsr->row_offsets->size());
    THRUSTINTARRAY jj(Ccsr->column_indices->size());
    ii   = *Ccsr->row_offsets;
    jj   = *Ccsr->column_indices;
    if (ciscompressed) d_i = c->compressedrow.i;
    PetscCallCUDA(cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
  } else {
    PetscInt *d_i = c->i;
    if (ciscompressed) d_i = c->compressedrow.i;
    PetscCallCUDA(cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
  }
  if (ciscompressed) { /* need to expand host row offsets */
    PetscInt r = 0;
    c->i[0] = 0;
    for (k = 0; k < c->compressedrow.nrows; k++) {
      const PetscInt next = c->compressedrow.rindex[k];
      const PetscInt old = c->compressedrow.i[k];
      for (; r < next; r++) c->i[r+1] = old;
    }
    for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
  }
  PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt)));
  PetscCall(PetscMalloc1(m,&c->ilen));
  PetscCall(PetscMalloc1(m,&c->imax));
  c->maxnz = c->nz;
  c->nonzerorowcnt = 0;
  c->rmax = 0;
  /* per-row lengths and row statistics, derived from the freshly copied row offsets */
  for (k = 0; k < m; k++) {
    const PetscInt nn = c->i[k+1] - c->i[k];
    c->ilen[k] = c->imax[k] = nn;
    c->nonzerorowcnt += (PetscInt)!!nn;
    c->rmax = PetscMax(c->rmax,nn);
  }
  PetscCall(MatMarkDiagonal_SeqAIJ(C));
  PetscCall(PetscMalloc1(c->nz,&c->a));
  Ccsr->num_entries = c->nz;

  C->nonzerostate++;
  PetscCall(PetscLayoutSetUp(C->rmap));
  PetscCall(PetscLayoutSetUp(C->cmap));
  Ccusp->nonzerostate = C->nonzerostate;
  C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
  C->preallocated  = PETSC_TRUE;
  C->assembled     = PETSC_FALSE;
  C->was_assembled = PETSC_FALSE;
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
2820 
2821 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2822 
/* Query the "run this product on the CPU" boolean option for one product type.

   The option spelling depends on how the product was requested: the legacy
   MatMatMult()-style API (api_user true) uses a per-type option (api_opt,
   e.g. -matmatmult_backend_cpu), while the MatProduct API shares the single
   -mat_product_algorithm_backend_cpu option under a per-type heading
   (prod_title). All strings must match the historically documented ones.

   Input:  mat, api_user, api_title/api_opt/prod_title/fname (display strings)
   In/out: usecpu - default on entry, user's choice on exit */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE_BackendCPU(Mat mat,PetscBool api_user,const char *api_title,const char *api_opt,const char *prod_title,const char *fname,PetscBool *usecpu)
{
  PetscFunctionBegin;
  if (api_user) {
    PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,api_title,"Mat");
    PetscCall(PetscOptionsBool(api_opt,"Use CPU code",fname,*usecpu,usecpu,NULL));
    PetscOptionsEnd();
  } else {
    PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,prod_title,"Mat");
    PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code",fname,*usecpu,usecpu,NULL));
    PetscOptionsEnd();
  }
  PetscFunctionReturn(0);
}

/* handles sparse or dense B */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
  PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense));
  /* the GPU path is only eligible when neither operand is pinned to the CPU */
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    PetscCall(PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp));
  }
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      PetscCall(PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp));
    }
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool usecpu = PETSC_FALSE;
    switch (product->type) {
    case MATPRODUCT_AB:
      PetscCall(MatProductSetFromOptions_SeqAIJCUSPARSE_BackendCPU(mat,product->api_user,"MatMatMult","-matmatmult_backend_cpu","MatProduct_AB","MatMatMult",&usecpu));
      break;
    case MATPRODUCT_AtB:
      PetscCall(MatProductSetFromOptions_SeqAIJCUSPARSE_BackendCPU(mat,product->api_user,"MatTransposeMatMult","-mattransposematmult_backend_cpu","MatProduct_AtB","MatTransposeMatMult",&usecpu));
      break;
    case MATPRODUCT_PtAP:
      PetscCall(MatProductSetFromOptions_SeqAIJCUSPARSE_BackendCPU(mat,product->api_user,"MatPtAP","-matptap_backend_cpu","MatProduct_PtAP","MatPtAP",&usecpu));
      break;
    case MATPRODUCT_RARt:
      PetscCall(MatProductSetFromOptions_SeqAIJCUSPARSE_BackendCPU(mat,product->api_user,"MatRARt","-matrart_backend_cpu","MatProduct_RARt","MatRARt",&usecpu));
      break;
    case MATPRODUCT_ABC:
      PetscCall(MatProductSetFromOptions_SeqAIJCUSPARSE_BackendCPU(mat,product->api_user,"MatMatMatMult","-matmatmatmult_backend_cpu","MatProduct_ABC","MatMatMatMult",&usecpu));
      break;
    default:
      break;
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
  }
  /* dispatch */
  if (isdense) {
    if (product->type == MATPRODUCT_ABC) {
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
    } else if (product->type == MATPRODUCT_AB   || product->type == MATPRODUCT_AtB ||
               product->type == MATPRODUCT_ABt  || product->type == MATPRODUCT_PtAP ||
               product->type == MATPRODUCT_RARt) {
      if (product->A->boundtocpu) {
        PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
    }
  } else if (Biscusp && Ciscusp) {
    if (product->type == MATPRODUCT_AB || product->type == MATPRODUCT_AtB || product->type == MATPRODUCT_ABt) {
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
    } else if (product->type == MATPRODUCT_PtAP || product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_ABC) {
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
    }
  } else { /* fallback for AIJ */
    PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
  }
  PetscFunctionReturn(0);
}
2944 
/* yy = A*xx; thin wrapper that delegates to the shared SpMV kernel with no transpose and no add (yy passed as the output slot, NULL as the add vector) */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE));
  PetscFunctionReturn(0);
}
2951 
/* zz = A*xx + yy; delegates to the shared SpMV kernel with no transpose */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE));
  PetscFunctionReturn(0);
}
2958 
/* yy = A^H * xx; delegates to the shared SpMV kernel with trans=TRUE, herm=TRUE */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE));
  PetscFunctionReturn(0);
}
2965 
/* zz = A^H * xx + yy; delegates to the shared SpMV kernel with trans=TRUE, herm=TRUE */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE));
  PetscFunctionReturn(0);
}
2972 
/* yy = A^T * xx; delegates to the shared SpMV kernel with trans=TRUE, herm=FALSE */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE));
  PetscFunctionReturn(0);
}
2979 
/* Device kernel: y[idx[i]] += x[i] for 0 <= i < n.

   Scatter-adds the (shorter) work vector x into the full-length vector y at the
   row positions listed in idx; used to expand the result of a compressed-row SpMV.
   Launched 1-D with a bounds guard, so any grid that covers n threads is valid. */
__global__ static void ScatterAdd(PetscInt n, const PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  /* compute the global index in PetscInt: the previous 'int' arithmetic could truncate
     the comparison against n when PETSc is configured with 64-bit indices */
  PetscInt i = (PetscInt)blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) y[idx[i]] += x[i];
}
2985 
/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op.

   Common kernel behind all the MatMult* wrappers above. yy may be NULL (then z = op(A) x)
   and may alias zz (then z is updated in place). Copies the matrix to the GPU first if a
   CPU-side operation (axpy, scale, ...) left it stale, and handles the compressed-row
   storage case via the work vector plus a scatter/gather.
*/
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny; /* lengths of x and y for y = op(A) x; set below for the CSR format */
#endif

  PetscFunctionBegin;
  /* herm without trans (i.e. conj(A) x) is not implemented */
  PetscCheck(!herm || trans,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
  if (!a->nz) { /* empty matrix: op(A) x = 0, so the result is yy (or zero when yy is NULL) */
    if (!yy) PetscCall(VecSet_SeqCUDA(zz,0));
    else PetscCall(VecCopy_SeqCUDA(yy,zz));
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    PetscCheck(matstruct,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    /* either let cuSPARSE do op = ^T/^H on the stored matrix, or multiply with a
       precomputed explicit transpose (built on demand) using op = no-op */
    if (herm || !A->form_explicit_transpose) {
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    PetscCall(VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray));
    if (yy == zz) PetscCall(VecCUDAGetArray(zz,&zarray)); /* read & write zz, so need to get uptodate zarray on GPU */
    else PetscCall(VecCUDAGetArrayWrite(zz,&zarray)); /* write zz, so no need to init zarray on GPU */

    PetscCall(PetscLogGpuTimeBegin());
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
     #endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
     #endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* opA indexes the per-operation descriptor cache below, hence the range check */
      PetscCheck(opA >= 0 && opA <= 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype));
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype));
        PetscCallCUSPARSE(cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                               matstruct->matDescr,
                                               matstruct->cuSpMV[opA].vecXDescr, beta,
                                               matstruct->cuSpMV[opA].vecYDescr,
                                               cusparse_scalartype,
                                               cusparsestruct->spmvAlg,
                                               &matstruct->cuSpMV[opA].spmvBufferSize));
        PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize));

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr));
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr));
      }

      PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA,
                                  matstruct->alpha_one,
                                  matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
                                  matstruct->cuSpMV[opA].vecXDescr,
                                  beta,
                                  matstruct->cuSpMV[opA].vecYDescr,
                                  cusparse_scalartype,
                                  cusparsestruct->spmvAlg,
                                  matstruct->cuSpMV[opA].spmvBuffer));
     #else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA,
                                       mat->num_rows, mat->num_cols,
                                       mat->num_entries, matstruct->alpha_one, matstruct->descr,
                                       mat->values->data().get(), mat->row_offsets->data().get(),
                                       mat->column_indices->data().get(), xptr, beta,
                                       dptr));
     #endif
    } else {
      if (cusparsestruct->nrows) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                         matstruct->alpha_one, matstruct->descr, hybMat,
                                         xptr, beta,
                                         dptr));
       #endif
      }
    }
    PetscCall(PetscLogGpuTimeEnd());

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          PetscCall(VecCopy_SeqCUDA(yy,zz)); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          PetscCall(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        PetscCall(VecSet_SeqCUDA(zz,0));
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        PetscCall(PetscLogGpuTimeBegin());
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
         */
       #if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
       #else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
       #endif
        PetscCall(PetscLogGpuTimeEnd());
      }
    } else {
      if (yy && yy != zz) {
        PetscCall(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */
      }
    }
    PetscCall(VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray));
    if (yy == zz) PetscCall(VecCUDARestoreArray(zz,&zarray));
    else PetscCall(VecCUDARestoreArrayWrite(zz,&zarray));
  } catch(char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  /* 2 flops (multiply + add) per stored nonzero; without yy, one add per nonzero row is saved */
  if (yy) {
    PetscCall(PetscLogGpuFlops(2.0*a->nz));
  } else {
    PetscCall(PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt));
  }
  PetscFunctionReturn(0);
}
3174 
/* zz = A^T * xx + yy; delegates to the shared SpMV kernel with trans=TRUE, herm=FALSE */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE));
  PetscFunctionReturn(0);
}
3181 
/* Finishes assembly on the host side and, if the nonzero structure changed during
   assembly, discards the now-stale device-resident matrix so it is rebuilt on demand */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscObjectState   statebefore     = A->nonzerostate; /* snapshot before host assembly runs */

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqAIJ(A,mode));
  if (cusparsestruct->deviceMat && statebefore != A->nonzerostate) {
    PetscCall(PetscInfo(A,"Destroy device mat since nonzerostate changed\n"));
    PetscCallCUDA(cudaFree(cusparsestruct->deviceMat));
    cusparsestruct->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}
3197 
3198 /* --------------------------------------------------------------------------------*/
3199 /*@
3200    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
3202    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
3203    assembly performance the user should preallocate the matrix storage by setting
3204    the parameter nz (or the array nnz).  By setting these parameters accurately,
3205    performance during matrix assembly can be increased by more than a factor of 50.
3206 
3207    Collective
3208 
3209    Input Parameters:
3210 +  comm - MPI communicator, set to PETSC_COMM_SELF
3211 .  m - number of rows
3212 .  n - number of columns
3213 .  nz - number of nonzeros per row (same for all rows)
3214 -  nnz - array containing the number of nonzeros in the various rows
3215          (possibly different for each row) or NULL
3216 
3217    Output Parameter:
3218 .  A - the matrix
3219 
3220    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
3222    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
3223 
3224    Notes:
3225    If nnz is given then nz is ignored
3226 
3227    The AIJ format (also called the Yale sparse matrix format or
3228    compressed row storage), is fully compatible with standard Fortran 77
3229    storage.  That is, the stored row and column indices can begin at
3230    either one (as in Fortran) or zero.  See the users' manual for details.
3231 
3232    Specify the preallocated storage with either nz or nnz (not both).
3233    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
3234    allocation.  For large problems you MUST preallocate memory or you
3235    will get TERRIBLE performance, see the users' manual chapter on matrices.
3236 
3237    By default, this format uses inodes (identical nodes) when possible, to
3238    improve numerical efficiency of matrix-vector products and solves. We
3239    search for consecutive rows with the same nonzero structure, thereby
3240    reusing matrix information to achieve increased efficiency.
3241 
3242    Level: intermediate
3243 
3244 .seealso: `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATSEQAIJCUSPARSE`, `MATAIJCUSPARSE`
3245 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscFunctionBegin;
  PetscCall(MatCreate(comm,A));
  PetscCall(MatSetSizes(*A,m,n,m,n));
  PetscCall(MatSetType(*A,MATSEQAIJCUSPARSE));
  /* the type is already known, so the SeqAIJ preallocation routine is called directly */
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz));
  PetscFunctionReturn(0);
}
3255 
/* Destroys a SEQAIJCUSPARSE matrix: frees the GPU-side data (or the triangular-factor
   data when the matrix is factored), removes every method composed on the object by
   MatConvert_SeqAIJ_SeqAIJCUSPARSE(), then destroys the host AIJ part */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  const char *const composedOps[] = {
    "MatSeqAIJCopySubArray_C",
    "MatCUSPARSESetFormat_C",
    "MatCUSPARSESetUseCPUSolve_C",
    "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",
    "MatProductSetFromOptions_seqaijcusparse_seqdense_C",
    "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",
    "MatFactorGetSolverType_C",
    "MatSetPreallocationCOO_C",
    "MatSetValuesCOO_C",
    "MatConvert_seqaijcusparse_hypre_C"
  };

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) PetscCall(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr));
  else PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr));
  /* reset each composed method to NULL, mirroring the installs done at conversion time */
  for (size_t i = 0; i < sizeof(composedOps)/sizeof(composedOps[0]); i++) {
    PetscCall(PetscObjectComposeFunction((PetscObject)A,composedOps[i],NULL));
  }
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(0);
}
3277 
3278 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
3279 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicates by first duplicating the host AIJ matrix, then converting the copy in
   place back to SEQAIJCUSPARSE (which reinstalls the GPU function table on it) */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqAIJ(A,cpvalues,B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B));
  PetscFunctionReturn(0);
}
3287 
/* Y = Y + a*X, computed on the GPU when both matrices are SEQAIJCUSPARSE and not bound
   to the CPU; otherwise falls back to the host MatAXPY_SeqAIJ(). Picks the cheapest
   path: a plain cuBLAS axpy on the value arrays when the nonzero patterns match, a
   cuSPARSE csrgeam when Y's pattern contains X's, or the host fallback otherwise. */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry,*csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) { /* one of the two operands is bound to the CPU: do the work on the host */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y,a,X,str));
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(cy->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  PetscCheck(cx->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;
  /* see if we can turn this into a cublas axpy */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    /* equal nonzero counts: compare row offsets and column indices on the device to detect identical patterns */
    bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
    if (eq) {
      eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
    }
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    PetscScalar b = 1.0; /* coefficient of Y in Y = a*X + b*Y */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t      bufferSize;
    void        *buffer;
#endif

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay));
    /* a and b live on the host, so switch the pointer mode for the geam calls and restore it afterwards */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* CUDA >= 11 requires an explicit workspace buffer sized by a separate query call */
    PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
                                                  &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                                  &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize));
    PetscCallCUDA(cudaMalloc(&buffer,bufferSize));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                                       &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                       &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                       cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCallCUDA(cudaFree(buffer));
#else
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                                       &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                       &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                       cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get()));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical patterns: the sum is a plain axpy on the two value arrays */
    cublasHandle_t cublasv2handle;
    PetscBLASInt   one = 1, bnz = 1;

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay));
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    PetscCall(PetscBLASIntCast(x->nz,&bnz));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one));
    PetscCall(PetscLogGpuFlops(2.0*bnz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else {
    /* different patterns: fall back to the host implementation */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y,a,X,str));
  }
  PetscFunctionReturn(0);
}
3381 
/* Y = a*Y, performed directly on the GPU copy of the stored nonzero values via cuBLAS */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)Y->data;
  PetscScalar    *values;
  cublasHandle_t handle;
  PetscBLASInt   inc = 1, n = 1;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&values));
  PetscCall(PetscCUBLASGetHandle(&handle));
  PetscCall(PetscBLASIntCast(aij->nz,&n));
  PetscCall(PetscLogGpuTimeBegin());
  /* scale the aij->nz stored values in place; one flop per nonzero */
  PetscCallCUBLAS(cublasXscal(handle,n,&a,values,inc));
  PetscCall(PetscLogGpuFlops(n));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&values));
  PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  PetscFunctionReturn(0);
}
3401 
/* Zeros all stored values of A — host copy and, when allocated, the GPU copies of the
   matrix and its explicit transpose — without changing the nonzero pattern */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  Mat_SeqAIJ *aij         = (Mat_SeqAIJ*)A->data;
  PetscBool  zeroedOnGPU  = PETSC_FALSE;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
    if (spptr->mat) {
      CsrMatrix *csr = (CsrMatrix*)spptr->mat->mat;
      if (csr->values) {
        zeroedOnGPU = PETSC_TRUE; /* device values are zeroed too, so both copies stay valid */
        thrust::fill(thrust::device,csr->values->begin(),csr->values->end(),0.);
      }
    }
    if (spptr->matTranspose) {
      CsrMatrix *csr = (CsrMatrix*)spptr->matTranspose->mat;
      if (csr->values) thrust::fill(thrust::device,csr->values->begin(),csr->values->end(),0.);
    }
  }
  PetscCall(PetscArrayzero(aij->a,aij->i[A->rmap->n]));
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  A->offloadmask = zeroedOnGPU ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
3430 
/* Switches the matrix's function table between the CPU (SeqAIJ) and GPU (CUSPARSE)
   implementations. flg = PETSC_TRUE binds to the CPU: the data is copied back from the
   GPU and the composed GPU-only methods are removed; flg = PETSC_FALSE (re)installs the
   GPU kernels. Factored matrices only record the flag. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    A->boundtocpu = flg;
    PetscFunctionReturn(0);
  }
  if (flg) {
    /* make sure the host copy is up to date before routing operations to the CPU */
    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* zero the whole SeqAIJ backend ops table so plain AIJ defaults are used */
    PetscCall(PetscMemzero(a->ops,sizeof(Mat_SeqAIJOps)));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL));
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    /* SeqAIJ backend ops that access the value array go through the CUSPARSE-aware versions */
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
   }
  A->boundtocpu = flg;
  /* inodes are a CPU-only optimization; enable them only when bound to the CPU */
  if (flg && a->inode.size) {
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3494 
/* Converts a SeqAIJ matrix to SEQAIJCUSPARSE: (optionally) duplicates/copies the host
   data per the reuse mode, switches the default vector type to CUDA, allocates the
   cuSPARSE context (spptr) the first time, installs the GPU function table via
   MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE), and composes the type-specific methods. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  Mat              B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    PetscCall(MatDuplicate(A,MAT_COPY_VALUES,newmat));
  } else if (reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatCopy(A,*newmat,SAME_NONZERO_PATTERN));
  }
  B = *newmat; /* for MAT_INPLACE_MATRIX, B is A itself */

  /* vectors created to match this matrix default to the CUDA type */
  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA,&B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;
      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle,PetscDefaultCudaStream));
      spptr->format     = MAT_CUSPARSE_CSR;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
     #if CUSPARSE_VERSION > 11301
      spptr->spmvAlg    = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
     #else
      spptr->spmvAlg    = CUSPARSE_CSRMV_ALG1;    /* default, since we only support csr */
     #endif
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
     #endif
      B->spptr = spptr;
    } else {
      /* factored matrices carry the triangular-factor context instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle,PetscDefaultCudaStream));
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* installs the GPU mult/scale/axpy kernels and the remaining composed methods */
  PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE));
  PetscCall(PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetUseCPUSolve_C",MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(0);
}
3554 
/*
  MatCreate_SeqAIJCUSPARSE - type constructor registered for MATSEQAIJCUSPARSE; builds a plain
  SeqAIJ matrix and then converts it in place, which attaches the GPU context and method table.
*/
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqAIJ(B));
  /* in-place conversion: B is both the source and the destination */
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B));
  PetscFunctionReturn(0);
}
3562 
3563 /*MC
3564    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3565 
   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
3568    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3569 
3570    Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_use_cpu_solve - Do MatSolve on CPU
3575 
3576   Level: beginner
3577 
3578 .seealso: `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
3579 M*/
3580 
3581 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);
3582 
/*
  MatSolverTypeRegister_CUSPARSE - registers the CUSPARSE solver packages with PETSc's
  MatGetFactor() dispatch: the banded LU solver for MATSEQAIJ and the generic CUSPARSE
  factorizations (LU, Cholesky, ILU, ICC) for MATSEQAIJCUSPARSE.
*/
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  const MatFactorType ftypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};

  PetscFunctionBegin;
  /* the banded LU variant operates on the host MATSEQAIJ type */
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND,MATSEQAIJ,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band));
  /* all remaining factorizations share one getter, registered per factor type */
  for (size_t ft = 0; ft < sizeof(ftypes)/sizeof(ftypes[0]); ft++) {
    PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftypes[ft],MatGetFactor_seqaijcusparse_cusparse));
  }
  PetscFunctionReturn(0);
}
3594 
/*
  MatResetPreallocationCOO_SeqAIJCUSPARSE - frees all COO-assembly scratch data held in the
  matrix's Mat_SeqAIJCUSPARSE context (thrust permutation arrays and, for the extended COO
  path, the device jmap/perm maps).

  All freed pointers are reset to NULL so this routine is idempotent and so that
  MatSeqAIJCUSPARSE_Destroy(), which frees any non-NULL jmap_d/perm_d, cannot double-free
  memory released here.
*/
static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat)
{
  Mat_SeqAIJCUSPARSE* cusp = (Mat_SeqAIJCUSPARSE*)mat->spptr;

  PetscFunctionBegin;
  if (!cusp) PetscFunctionReturn(0); /* no GPU context attached; nothing to reset */
  delete cusp->cooPerm;
  delete cusp->cooPerm_a;
  cusp->cooPerm   = NULL;
  cusp->cooPerm_a = NULL;
  if (cusp->use_extended_coo) {
    PetscCallCUDA(cudaFree(cusp->jmap_d));
    PetscCallCUDA(cudaFree(cusp->perm_d));
    cusp->jmap_d = NULL; /* avoid dangling pointers: destroy path frees any non-NULL jmap_d/perm_d */
    cusp->perm_d = NULL;
  }
  cusp->use_extended_coo = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3612 
/*
  MatSeqAIJCUSPARSE_Destroy - tears down a Mat_SeqAIJCUSPARSE context: the device CSR
  representations (plain and transpose), cached thrust work arrays, the cusparse handle,
  the extended-COO device maps, and finally the struct itself.
*/
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscFunctionBegin;
  if (*cusparsestruct) {
    /* device matrix representations first; format tells the destroyer how mat/matTranspose are stored */
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format));
    /* thrust containers: delete on NULL is a no-op, so these need no guards */
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    delete (*cusparsestruct)->csr2csc_i;
    if ((*cusparsestruct)->handle) PetscCallCUSPARSE(cusparseDestroy((*cusparsestruct)->handle));
    /* raw device allocations from the extended COO path must be guarded before cudaFree */
    if ((*cusparsestruct)->jmap_d) PetscCallCUDA(cudaFree((*cusparsestruct)->jmap_d));
    if ((*cusparsestruct)->perm_d) PetscCallCUDA(cudaFree((*cusparsestruct)->perm_d));
    PetscCall(PetscFree(*cusparsestruct)); /* also sets *cusparsestruct to NULL */
  }
  PetscFunctionReturn(0);
}
3631 
/*
  CsrMatrix_Destroy - releases the three device arrays owned by a CsrMatrix (values,
  column indices, row offsets), deletes the struct, and zeros the caller's pointer.
*/
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *csr = *mat;

  PetscFunctionBegin;
  if (csr) {
    delete csr->values;
    delete csr->column_indices;
    delete csr->row_offsets;
    delete csr;
    *mat = NULL;
  }
  PetscFunctionReturn(0);
}
3644 
/*
  MatSeqAIJCUSPARSEMultStruct_Destroy (triangular-factor overload) - frees one triangular
  factor: its cusparse matrix descriptor and analysis info, the device CSR storage, the
  solve scratch buffer, the pinned host copy of the values, and (CUDA >= 11) the csr2csc
  conversion buffer; finally frees the struct itself.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
    if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparse_destroy_analysis_info((*trifactor)->solveInfo));
    PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
    if ((*trifactor)->solveBuffer)   PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
    if ((*trifactor)->AA_h)   PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h)); /* AA_h is pinned host memory, so cudaFreeHost */
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
   #endif
    PetscCall(PetscFree(*trifactor)); /* also sets *trifactor to NULL */
  }
  PetscFunctionReturn(0);
}
3661 
/*
  MatSeqAIJCUSPARSEMultStruct_Destroy (mult-struct overload) - frees the device
  representation used for MatMult: the stored matrix (CSR, or HYB on CUDA < 11), the
  cusparse matrix descriptor, the compressed-row indices, the device-resident scalar
  constants, and (CUDA >= 11) the generic-API SpMV descriptors and buffers.

  Input Parameters:
+ matstruct - pointer to the struct pointer; set to NULL on return
- format    - storage format the struct was built with (determines how ->mat is freed)
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
       #endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        PetscCall(CsrMatrix_Destroy(&mat)); /* check the error code; was previously dropped */
      }
    }
    if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    /* alpha_one/beta_zero/beta_one are device-resident scalars used by the cusparse calls */
    if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
    if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
    if ((*matstruct)->beta_one)  PetscCallCUDA(cudaFree((*matstruct)->beta_one));

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
    /* three cached SpMV setups (e.g. N/T/H products); free only the ones that were built */
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
3703 
/*
  MatSeqAIJCUSPARSETriFactors_Reset - frees the contents of a triangular-factors context
  (the four factor structs, permutation index arrays, work vector, and banded-solver device
  arrays) without freeing the context itself, so it can be refilled by a new factorization.

  Every freed pointer is reset to NULL; this makes the routine safe to call repeatedly
  (it is also invoked from MatSeqAIJCUSPARSETriFactors_Destroy), matching the existing
  NULLing of rpermIndices/cpermIndices/workVector.
*/
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors)
{
  PetscFunctionBegin;
  if (*trifactors) {
    /* the factor destroyer NULLs each pointer it frees */
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose));
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = NULL;
    (*trifactors)->cpermIndices = NULL;
    (*trifactors)->workVector = NULL;
    /* banded-factorization device arrays; NULL afterwards so a second Reset cannot double-free */
    if ((*trifactors)->a_band_d)   PetscCallCUDA(cudaFree((*trifactors)->a_band_d));
    if ((*trifactors)->i_band_d)   PetscCallCUDA(cudaFree((*trifactors)->i_band_d));
    (*trifactors)->a_band_d = NULL;
    (*trifactors)->i_band_d = NULL;
    (*trifactors)->init_dev_prop = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3724 
/*
  MatSeqAIJCUSPARSETriFactors_Destroy - resets the triangular-factors context, destroys its
  cusparse handle (if one was created), and frees the context struct itself.

  Note: the previous version used an assignment inside the if-condition
  ('if (handle = ...)'), which draws compiler warnings and obscured intent.
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscFunctionBegin;
  if (*trifactors) {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
    if ((*trifactors)->handle) PetscCallCUSPARSE(cusparseDestroy((*trifactors)->handle));
    PetscCall(PetscFree(*trifactors)); /* also sets *trifactors to NULL */
  }
  PetscFunctionReturn(0);
}
3739 
/* Strict-weak "less than" on (i,j) pairs: order by row index, break ties by column index.
   Used to sort COO entries into CSR order. */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    const PetscInt r1 = t1.get<0>(), r2 = t2.get<0>();
    return (r1 < r2) || (r1 == r2 && t1.get<1>() < t2.get<1>());
  }
};
3750 
/* Equality on (i,j) pairs: true iff both the row and the column indices match.
   Used to collapse duplicate COO entries. */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>();
  }
};
3760 
/* Binary "changed" indicator: 1 when the two values differ, 0 when they are equal.
   Applied via adjacent_difference to flag where a sorted index sequence changes. */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != t2) ? 1 : 0;
  }
};
3769 
/* Logical OR returned as a PetscInt (1 if either operand is nonzero, else 0).
   Combines the row-change and column-change flags from IJDiff. */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != 0 || t2 != 0) ? 1 : 0;
  }
};
3778 
3779 #include <thrust/iterator/discard_iterator.h>
3780 /* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic() */
/*
  MatSetValuesCOO_SeqAIJCUSPARSE_Basic - scatters (and, for duplicated COO entries, reduces)
  the user-provided value array v[] into the device CSR values, using the permutation
  (cooPerm) and optional duplicate map (cooPerm_a) built by
  MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic().

  Input Parameters:
+ A     - the MATSEQAIJCUSPARSE matrix, preallocated via the basic COO path
. v     - values in the original COO order (host or device memory), or NULL
- imode - INSERT_VALUES or ADD_VALUES

  Notes: v may be NULL, in which case INSERT_VALUES zeros the matrix; the assembled flags
  are set directly at the end instead of going through MatAssemblyEnd_SeqAIJ.
*/
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY                           *cooPerm_v = NULL;  /* temporary device copy of v[] when v is host memory */
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                             *matrix;
  PetscInt                              n;

  PetscFunctionBegin;
  PetscCheck(cusp,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    /* no COO preallocation data: fall back to a plain final assembly */
    PetscCall(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY));
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) {
    /* NULL values: INSERT zeroes the matrix, ADD leaves it untouched */
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v); /* already on the device; use in place */
  } else {
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v,v+n);
    d_v = cooPerm_v->data();
    PetscCall(PetscLogCpuToGpu(n*sizeof(PetscScalar)));
  }
  PetscCall(PetscLogGpuTimeBegin());
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to add these them */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
        cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonozeros in d_v[].
        cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
      */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      /* add the per-nonzero sums onto the existing CSR values */
      thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* all nonzeros in d_v[] are unique entries */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]]  */
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else {
      /* unique entries with INSERT: straight permuted copy, values[i] = d_v[cooPerm[i]] */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    }
  }
  PetscCall(PetscLogGpuTimeEnd());
finalize:
  delete cooPerm_v; /* delete on NULL is a no-op */
  A->offloadmask = PETSC_OFFLOAD_GPU; /* device copy is now authoritative */
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",A->rmap->n,A->cmap->n,a->nz));
  PetscCall(PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",a->rmax));
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
3861 
/*
  MatSeqAIJCUSPARSEInvalidateTranspose - marks the cached device transpose as stale; when
  destroy is true, also frees the transpose mult-struct and the cached csr2csc index map.
*/
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp) {
    if (destroy) {
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format));
      delete cusp->csr2csc_i;
      cusp->csr2csc_i = NULL;
    }
    A->transupdated = PETSC_FALSE; /* next use of the transpose must rebuild/refresh it */
  }
  PetscFunctionReturn(0);
}
3877 
3878 #include <thrust/binary_search.h>
3879 /* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */
/*
  MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic - builds the CSR structure of A on the GPU
  from COO index arrays (which must contain no negative indices), and records in
  cusp->cooPerm the permutation from user COO order to sorted CSR order; when duplicates
  exist, cusp->cooPerm_a maps each COO entry to its unique-nonzero position.

  Input Parameters:
+ A     - the MATSEQAIJCUSPARSE matrix
. n     - number of COO entries
. coo_i - row indices (host memory, all >= 0)
- coo_j - column indices (host memory, all >= 0)

  Notes: the host-side Mat_SeqAIJ arrays (a->i, a->j, ilen, imax) are rebuilt to match,
  values are zeroed, and the assembled structure is pushed to the GPU at the end.
*/
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0; /* nzr counts rows that have at least one nonzero */

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(A->rmap));
  PetscCall(PetscLayoutSetUp(A->cmap));
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    /* size changed: discard previous permutation arrays; they are re-created below */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);          /* device copy of coo_i */
    THRUSTINTARRAY d_j(n);          /* device copy of coo_j */
    THRUSTINTARRAY ii(A->rmap->n);  /* per-row nonzero upper bounds (becomes a->i[1..n]) */

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    PetscCall(PetscLogCpuToGpu(2.*n*sizeof(PetscInt)));
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);

    /* Ex.
      n = 6
      coo_i = [3,3,1,4,1,4]
      coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    PetscCall(PetscLogGpuTimeBegin());
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    *cusp->cooPerm_a = d_i; /* copy the sorted array */
    THRUSTINTARRAY w = d_j;

    /*
      d_i     = [1,1,3,3,4,4]
      d_j     = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i     = [1,3,3,4,4,x]
                            ^ekey
      d_j     = [2,2,3,5,6,x]
                           ^nekye
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());                                              /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
      (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a =          [0,0,1,1,1,1]*/
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin()); /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    PetscCall(PetscLogGpuTimeEnd());

    /* rebuild the host CSR arrays of the underlying SeqAIJ from the device results */
    PetscCall(MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i));
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(A->rmap->n+1,&a->i));
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    PetscCallCUDA(cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    PetscCall(PetscMalloc1(a->nz,&a->a));
    PetscCall(PetscMalloc1(a->nz,&a->j));
    PetscCallCUDA(cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n,&a->ilen));
    if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n,&a->imax));
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    PetscCall(PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt)));
    PetscCall(MatMarkDiagonal_SeqAIJ(A));
  } else {
    /* empty matrix: fall back to a zero-nonzero preallocation */
    PetscCall(MatSeqAIJSetPreallocation(A,0,NULL));
  }
  PetscCall(MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE));

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  PetscCall(PetscArrayzero(a->a,a->nz));
  PetscCall(MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6));
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
  PetscFunctionReturn(0);
}
3991 
/*
  MatSetPreallocationCOO_SeqAIJCUSPARSE - preallocates the matrix from COO index arrays,
  choosing between two paths: the 'basic' GPU path when the indices are device-resident or
  contain no negative entries, and the extended host path (MatSetPreallocationCOO_SeqAIJ)
  otherwise, whose jmap/perm maps are then mirrored onto the device.

  Input Parameters:
+ mat   - the MATSEQAIJCUSPARSE matrix
. coo_n - number of COO entries
. coo_i - row indices (host or device memory; may contain negative "ignore" entries)
- coo_j - column indices
*/
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_SeqAIJ         *seq;
  Mat_SeqAIJCUSPARSE *dev;
  PetscBool          coo_basic = PETSC_TRUE;
  PetscMemType       mtype = PETSC_MEMTYPE_DEVICE;

  PetscFunctionBegin;
  /* drop any COO data from a previous preallocation, on both host and device */
  PetscCall(MatResetPreallocationCOO_SeqAIJ(mat));
  PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat));
  if (coo_i) {
    PetscCall(PetscGetMemType(coo_i,&mtype));
    if (PetscMemTypeHost(mtype)) {
      /* scan host indices for negative entries; any such entry forces the extended path */
      for (PetscCount k=0; k<coo_n; k++) {
        if (coo_i[k] < 0 || coo_j[k] < 0) {coo_basic = PETSC_FALSE; break;}
      }
    }
  }

  if (coo_basic) { /* i,j are on device or do not contain negative indices */
    PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j));
  } else {
    /* build the CSR pattern plus jmap/perm on the host, then copy those maps to the device */
    PetscCall(MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));
    seq  = static_cast<Mat_SeqAIJ*>(mat->data);
    dev  = static_cast<Mat_SeqAIJCUSPARSE*>(mat->spptr);
    /* jmap has nz+1 entries; perm has Atot entries — sizes as produced by the host routine */
    PetscCallCUDA(cudaMalloc((void**)&dev->jmap_d,(seq->nz+1)*sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->jmap_d,seq->jmap,(seq->nz+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMalloc((void**)&dev->perm_d,seq->Atot*sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->perm_d,seq->perm,seq->Atot*sizeof(PetscCount),cudaMemcpyHostToDevice));
    dev->use_extended_coo = PETSC_TRUE; /* route MatSetValuesCOO to the kernel-based path */
  }
  PetscFunctionReturn(0);
}
4027 
/* Device kernel for the extended COO path: for each owned nonzero i, sum its repeated COO
   contributions kv[perm[jmap[i] .. jmap[i+1])] and write (INSERT_VALUES) or accumulate
   (otherwise) the sum into a[i]. Launched as a 1-D grid; the grid-stride loop covers all
   nnz entries regardless of launch size. */
__global__ static void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount jmap[],const PetscCount perm[],InsertMode imode,PetscScalar a[])
{
  const PetscCount stride = gridDim.x * blockDim.x;
  for (PetscCount i = blockIdx.x*blockDim.x + threadIdx.x; i < nnz; i += stride) {
    PetscScalar vsum = 0.0;
    for (PetscCount k = jmap[i]; k < jmap[i+1]; k++) vsum += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + vsum;
  }
}
4038 
/*
  MatSetValuesCOO_SeqAIJCUSPARSE - sets/adds matrix values from an array in the original
  COO order. Delegates to the 'basic' thrust implementation unless the extended COO path
  was selected during preallocation, in which case the MatAddCOOValues kernel is used.
*/
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ          *seq = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE  *dev = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscCount          nnz  = seq->nz;
  PetscMemType        memtype;
  const PetscScalar   *vdev = v; /* device-accessible view of v[] */
  PetscScalar         *Aa;

  PetscFunctionBegin;
  if (!dev->use_extended_coo) {
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A,v,imode));
    PetscFunctionReturn(0);
  }
  PetscCall(PetscGetMemType(v,&memtype));
  if (PetscMemTypeHost(memtype)) { /* host-resident v[]: stage a device copy for the kernel */
    PetscCallCUDA(cudaMalloc((void**)&vdev,seq->coo_n*sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy((void*)vdev,v,seq->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice));
  }

  /* INSERT overwrites every value, so write-only access suffices; ADD must read current values */
  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa));
  else                        PetscCall(MatSeqAIJCUSPARSEGetArray(A,&Aa));

  if (nnz) {
    MatAddCOOValues<<<(nnz+255)/256,256>>>(vdev,nnz,dev->jmap_d,dev->perm_d,imode,Aa);
    PetscCallCUDA(cudaPeekAtLastError()); /* surface any launch-configuration error */
  }

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa));
  else                        PetscCall(MatSeqAIJCUSPARSERestoreArray(A,&Aa));

  if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void*)vdev)); /* release the staged copy */
  PetscFunctionReturn(0);
}
4073 
4074 /*@C
4075     MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices.
4076 
4077    Not collective
4078 
4079     Input Parameters:
4080 +   A - the matrix
4081 -   compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form
4082 
4083     Output Parameters:
4084 +   ia - the CSR row pointers
4085 -   ja - the CSR column indices
4086 
4087     Level: developer
4088 
4089     Notes:
4090       When compressed is true, the CSR structure does not contain empty rows
4091 
4092 .seealso: `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
4093 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ*)A->data;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  if (!i || !j) PetscFunctionReturn(0); /* both outputs must be requested */
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* ensure the device CSR exists and is current */
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  /* past the early return above, both i and j are known to be non-NULL */
  if (!compressed && a->compressedrow.use) {
    /* the stored CSR is row-compressed but the caller wants full row offsets:
       build (and cache) a device copy of the full a->i on first use */
    if (!cusp->rowoffsets_gpu) {
      cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
    }
    *i = cusp->rowoffsets_gpu->data().get();
  } else {
    *i = csr->row_offsets->data().get();
  }
  *j = csr->column_indices->data().get();
  PetscFunctionReturn(0);
}
4121 
4122 /*@C
4123     MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ()
4124 
4125    Not collective
4126 
4127     Input Parameters:
4128 +   A - the matrix
4129 -   compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form
4130 
4131     Output Parameters:
4132 +   ia - the CSR row pointers
4133 -   ja - the CSR column indices
4134 
4135     Level: developer
4136 
4137 .seealso: `MatSeqAIJCUSPARSEGetIJ()`
4138 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* only invalidate the borrowed pointers; no device work is required on restore */
  if (i) *i = NULL;
  if (j) *j = NULL;
  PetscFunctionReturn(0);
}
4148 
4149 /*@C
4150    MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4151 
4152    Not Collective
4153 
4154    Input Parameter:
4155 .   A - a MATSEQAIJCUSPARSE matrix
4156 
4157    Output Parameter:
4158 .   a - pointer to the device data
4159 
4160    Level: developer
4161 
4162    Notes: may trigger host-device copies if up-to-date matrix data is on host
4163 
4164 .seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
4165 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csrmat;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* make sure the device copy is current before handing out a device pointer */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)cusp->mat->mat;
  PetscCheck(csrmat->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  /* read-only access: offload mask and object state are left untouched */
  *a = csrmat->values->data().get();
  PetscFunctionReturn(0);
}
4183 
4184 /*@C
4185    MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead()
4186 
4187    Not Collective
4188 
4189    Input Parameter:
4190 .   A - a MATSEQAIJCUSPARSE matrix
4191 
4192    Output Parameter:
4193 .   a - pointer to the device data
4194 
4195    Level: developer
4196 
4197 .seealso: `MatSeqAIJCUSPARSEGetArrayRead()`
4198 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* read-only access ended: just invalidate the caller's pointer */
  *a = NULL;
  PetscFunctionReturn(0);
}
4208 
4209 /*@C
4210    MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4211 
4212    Not Collective
4213 
4214    Input Parameter:
4215 .   A - a MATSEQAIJCUSPARSE matrix
4216 
4217    Output Parameter:
4218 .   a - pointer to the device data
4219 
4220    Level: developer
4221 
4222    Notes: may trigger host-device copies if up-to-date matrix data is on host
4223 
4224 .seealso: `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
4225 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csrmat;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* bring the device copy up to date before exposing it for modification */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)cusp->mat->mat;
  PetscCheck(csrmat->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csrmat->values->data().get();
  /* the caller may write through *a: mark the device copy authoritative and
     invalidate the cached transpose values (structure kept, hence PETSC_FALSE) */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
  PetscFunctionReturn(0);
}
4245 /*@C
4246    MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray()
4247 
4248    Not Collective
4249 
   Input Parameters:
+   A - a MATSEQAIJCUSPARSE matrix
-   a - pointer to the device data
4255 
4256    Level: developer
4257 
4258 .seealso: `MatSeqAIJCUSPARSEGetArray()`
4259 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCall(MatSeqAIJInvalidateDiagonal(A)); /* values may have changed: cached diagonal information is stale */
  PetscCall(PetscObjectStateIncrease((PetscObject)A)); /* record that the matrix has been modified */
  *a = NULL; /* invalidate the caller's pointer */
  PetscFunctionReturn(0);
}
4271 
4272 /*@C
4273    MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4274 
4275    Not Collective
4276 
4277    Input Parameter:
4278 .   A - a MATSEQAIJCUSPARSE matrix
4279 
4280    Output Parameter:
4281 .   a - pointer to the device data
4282 
4283    Level: developer
4284 
4285    Notes: does not trigger host-device copies and flags data validity on the GPU
4286 
4287 .seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
4288 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* only the CSR storage format exposes a single contiguous value array */
  PetscCheck(cusparsestruct->format != MAT_CUSPARSE_ELL && cusparsestruct->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* write-only access: no host-to-device copy is performed here, so the device
     structure must already exist */
  PetscCheck(cusparsestruct->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix*)cusparsestruct->mat->mat;
  PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = matrix->values->data().get();
  /* the caller will overwrite the values: the device copy becomes the
     authoritative one and any cached transpose no longer matches */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
  PetscFunctionReturn(0);
}
4307 
4308 /*@C
4309    MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite()
4310 
4311    Not Collective
4312 
   Input Parameters:
+   A - a MATSEQAIJCUSPARSE matrix
-   a - pointer to the device data
4318 
4319    Level: developer
4320 
4321 .seealso: `MatSeqAIJCUSPARSEGetArrayWrite()`
4322 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCall(MatSeqAIJInvalidateDiagonal(A)); /* values have been (re)written: cached diagonal information is stale */
  PetscCall(PetscObjectStateIncrease((PetscObject)A)); /* record that the matrix has been modified */
  *a = NULL; /* invalidate the caller's pointer */
  PetscFunctionReturn(0);
}
4334 
/* Strict weak ordering for (row, col, val, flag) tuples: lexicographic on
   (row, col) only; the value and flag components do not take part in the
   comparison. Used by thrust::merge to interleave two row-sorted streams. */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    const int r1 = thrust::get<0>(t1);
    const int r2 = thrust::get<0>(t2);
    if (r1 != r2) return r1 < r2;
    return thrust::get<1>(t1) < thrust::get<1>(t2);
  }
};
4345 
/* Unary functor adding a fixed integer offset to its argument; used to shift
   column indices and row offsets when concatenating two matrices. */
struct Shift
{
  int _shift; /* constant offset applied to every input */

  Shift(int shift) : _shift(shift) {}
  __host__ __device__
  inline int operator() (const int &c)
  {
    return _shift + c;
  }
};
4357 
/* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in matlab notation,
   i.e. C = [A B]. With MAT_INITIAL_MATRIX both the sparsity pattern and the values of C are built on the
   device; with MAT_REUSE_MATRIX only the values are scattered into C, reusing the A->C/B->C destination
   permutation stored in Ccusp->cooPerm at creation time. */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     Annz,Bnnz;
  cusparseStatus_t             stat;
  PetscInt                     i,m,n,zero = 0;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidHeaderSpecific(B,MAT_CLASSID,2);
  PetscValidPointer(C,4);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
  PetscCheck(A->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
  PetscCheck(reuse != MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
  PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    /* build both the sparsity pattern and the values of C from scratch */
    m     = A->rmap->n;
    n     = A->cmap->n + B->cmap->n;
    PetscCall(MatCreate(PETSC_COMM_SELF,C));
    PetscCall(MatSetSizes(*C,m,n,m,n));
    PetscCall(MatSetType(*C,MATSEQAIJCUSPARSE));
    c     = (Mat_SeqAIJ*)(*C)->data;
    Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
    Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr  = new CsrMatrix;
    /* C is stored as a plain (non compressed-row) CSR matrix */
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows    = m;
    Ccusp->mat      = Cmat;
    Ccusp->mat->mat = Ccsr;
    Ccsr->num_rows  = m;
    Ccsr->num_cols  = n;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
    PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
    PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
    PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
    PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr = (CsrMatrix*)Acusp->mat->mat;
    Bcsr = (CsrMatrix*)Bcusp->mat->mat;
    Annz = (PetscInt)Acsr->column_indices->size();
    Bnnz = (PetscInt)Bcsr->column_indices->size();
    c->nz = Annz + Bnnz;
    Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    Ccsr->num_entries = c->nz;
    /* cooPerm will hold the destination position in C's value array of each entry of A
       (first Annz slots) and of B (last Bnnz slots); it drives the MAT_REUSE_MATRIX path */
    Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      auto Acoo = new THRUSTINTARRAY32(Annz);
      auto Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff,*Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt)));
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      PetscCall(PetscLogGpuTimeBegin());
      /* expand the CSR row offsets of A and B into explicit COO row indices so the
         entries can be merged as two sorted streams of (row,col,val) tuples */
      stat = cusparseXcsr2coo(Acusp->handle,
                              Aroff->data().get(),
                              Annz,
                              m,
                              Acoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle,
                              Broff->data().get(),
                              Bnnz,
                              m,
                              Bcoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
      /* columns of B are shifted by A->cmap->n since B sits to the right of A in C */
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
#endif
      /* wPerm receives the origin flag (1 for A, 0 for B) of each merged entry of C */
      auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
      auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
      auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
      auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
      auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
      auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
      auto p1 = Ccusp->cooPerm->begin();
      auto p2 = Ccusp->cooPerm->begin();
      thrust::advance(p2,Annz);
      /* merge the two streams ordered by (row,col); IJCompare4 only looks at rows and columns */
      PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
      /* undo the in-place shift applied to B's column indices above */
      thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
#endif
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      /* split the destination indices: C positions flagged 1 (from A) go to the first
         Annz slots of cooPerm, positions flagged 0 (from B) to the remaining Bnnz slots */
      auto pred = thrust::identity<int>();
      PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
      PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
#endif
      /* compress the merged COO row indices back into CSR row offsets for C */
      stat = cusparseXcoo2csr(Ccusp->handle,
                              Ccoo->data().get(),
                              c->nz,
                              m,
                              Ccsr->row_offsets->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat);
      PetscCall(PetscLogGpuTimeEnd());
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
                               Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix *CcsrT = new CsrMatrix;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated = PETSC_TRUE;
        Ccusp->rowoffsets_gpu = NULL;
        CmatT->cprowIndices = NULL;
        CmatT->mat = CcsrT;
        CcsrT->num_rows = n;
        CcsrT->num_cols = m;
        CcsrT->num_entries = c->nz;

        CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values = new THRUSTARRAY(c->nz);

        /* C^T = [A^T; B^T] stacked by rows: copy A^T first, then append B^T with its
           row offsets shifted by a->nz (the number of entries of A, hence of A^T) */
        PetscCall(PetscLogGpuTimeBegin());
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
          thrust::advance(rT,-1); /* the last offset of A^T coincides with the first shifted offset of B^T */
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
          thrust::copy(titb,tite,rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        PetscCall(PetscLogGpuTimeEnd());

        PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
                                 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
                                 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* mirror the assembled pattern on the host so the SeqAIJ side of C is consistent */
    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(m+1,&c->i));
    PetscCall(PetscMalloc1(c->nz,&c->j));
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii   = *Ccsr->row_offsets;
      jj   = *Ccsr->column_indices;
      PetscCallCUDA(cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    } else {
      PetscCallCUDA(cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    }
    PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt)));
    PetscCall(PetscMalloc1(m,&c->ilen));
    PetscCall(PetscMalloc1(m,&c->imax));
    c->maxnz = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax = 0;
    /* fill per-row lengths and row statistics from the row offsets just copied to the host */
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i+1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax,nn);
    }
    PetscCall(MatMarkDiagonal_SeqAIJ(*C));
    PetscCall(PetscMalloc1(c->nz,&c->a));
    (*C)->nonzerostate++;
    PetscCall(PetscLayoutSetUp((*C)->rmap));
    PetscCall(PetscLayoutSetUp((*C)->cmap));
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* MAT_REUSE_MATRIX: the pattern of C is unchanged, only scatter the current values
       of A and B into C's value array through the permutation recorded in cooPerm */
    PetscCheck((*C)->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,(*C)->rmap->n,B->rmap->n);
    c = (Mat_SeqAIJ*)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
      PetscCheck(Ccusp->cooPerm,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
      PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
      PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate,PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
      PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix*)Acusp->mat->mat;
      Bcsr = (CsrMatrix*)Bcusp->mat->mat;
      Ccsr = (CsrMatrix*)Ccusp->mat->mat;
      PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %" PetscInt_FMT " != %" PetscInt_FMT,Acsr->num_entries,(PetscInt)Acsr->values->size());
      PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %" PetscInt_FMT " != %" PetscInt_FMT,Bcsr->num_entries,(PetscInt)Bcsr->values->size());
      PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT,Ccsr->num_entries,(PetscInt)Ccsr->values->size());
      PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries,PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT,Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
      PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %" PetscInt_FMT " != %" PetscInt_FMT,(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid,Acsr->num_entries);
      PetscCall(PetscLogGpuTimeBegin());
      /* scatter A's values: C[cooPerm[0..Annz)] = A's values */
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      thrust::for_each(zibait,zieait,VecCUDAEquals());
      /* scatter B's values: C[cooPerm[Annz..)] = B's values */
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
      thrust::for_each(zibbit,ziebit,VecCUDAEquals());
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE));
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        PetscCheck(Ccusp->matTranspose,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
        /* values of C^T are those of A^T followed by those of B^T */
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      PetscCall(PetscLogGpuTimeEnd());
    }
  }
  PetscCall(PetscObjectStateIncrease((PetscObject)*C));
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
4659 
/* Copies matrix values selected by idx (or, when idx is NULL, the leading n values)
   from the device value array of A into v, which may live in host or device memory.

   Input Parameters:
+  A   - a MATSEQAIJCUSPARSE matrix
.  n   - number of entries to copy
-  idx - positions (in the aij value array) of the entries to copy, or NULL for the first n entries

   Output Parameter:
.  v   - destination array; host or device memory (detected via isCudaMem)
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool              dmem; /* true when v is device memory */
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A,&av));
  if (n && idx) {
    /* upload the index set and gather the selected entries on the device */
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt)));

    THRUSTARRAY *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      /* gather into a device workspace, then copy back to the host below */
      w = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (w) {
      PetscCallCUDA(cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
    }
    delete w;
  } else {
    PetscCallCUDA(cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  /* when v is host memory the scalars moved device -> host, so log GpuToCpu traffic
     (the original code logged CpuToGpu, i.e. the wrong direction) */
  if (!dmem) PetscCall(PetscLogGpuToCpu(n*sizeof(PetscScalar)));
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A,&av));
  PetscFunctionReturn(0);
}
4697