xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 40750d414264c8a755a615ff0e2cc923e44b4f14)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
7 
8 #include <petscconf.h>
9 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
10 #include <../src/mat/impls/sbaij/seq/sbaij.h>
11 #include <../src/vec/vec/impls/dvecimpl.h>
12 #include <petsc/private/vecimpl.h>
13 #undef VecType
14 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
15 #include <thrust/adjacent_difference.h>
16 #include <thrust/async/for_each.h>
17 #include <thrust/iterator/constant_iterator.h>
18 #include <thrust/remove.h>
19 #include <thrust/sort.h>
20 #include <thrust/unique.h>
21 
/* Option strings for MatCUSPARSEStorageFormat; the trailing entries (enum type name, option
   prefix, terminating 0) follow the convention required by PetscOptionsEnum(), which parses
   these strings in MatSetFromOptions_SeqAIJCUSPARSE() below */
22 const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
23 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
24   /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
25     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
26 
27   typedef enum {
28       CUSPARSE_MV_ALG_DEFAULT = 0,
29       CUSPARSE_COOMV_ALG      = 1,
30       CUSPARSE_CSRMV_ALG1     = 2,
31       CUSPARSE_CSRMV_ALG2     = 3
32   } cusparseSpMVAlg_t;
33 
34   typedef enum {
35       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
36       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
37       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
38       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
39       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
40       CUSPARSE_SPMM_ALG_DEFAULT = 0,
41       CUSPARSE_SPMM_COO_ALG1    = 1,
42       CUSPARSE_SPMM_COO_ALG2    = 2,
43       CUSPARSE_SPMM_COO_ALG3    = 3,
44       CUSPARSE_SPMM_COO_ALG4    = 5,
45       CUSPARSE_SPMM_CSR_ALG1    = 4,
46       CUSPARSE_SPMM_CSR_ALG2    = 6,
47   } cusparseSpMMAlg_t;
48 
49   typedef enum {
50       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
51       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
52   } cusparseCsr2CscAlg_t;
53   */
54   const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
55   const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
56   const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
57 #endif
58 
59 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
60 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
61 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
62 
63 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
64 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
65 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
66 
67 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
68 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
69 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
70 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
71 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
72 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
73 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
74 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
75 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
76 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
77 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
78 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
79 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
80 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
81 
82 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
83 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
84 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
85 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
86 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
87 
88 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
89 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);
90 
91 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
92 static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscCount,const PetscInt[],const PetscInt[]);
93 static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
94 
/* Query callback composed on factored matrices (see MatGetFactor_seqaijcusparse_cusparse below):
   always reports MATSOLVERCUSPARSE as the solver package; the matrix argument is unused */
95 PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
96 {
97   PetscFunctionBegin;
98   *type = MATSOLVERCUSPARSE;
99   PetscFunctionReturn(0);
100 }
101 
102 /*MC
103   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
104   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
105   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
106   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
107   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
108   algorithms are not recommended. This class does NOT support direct solver operations.
109 
110   Level: beginner
111 
112 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
113 M*/
114 
/*
   MatGetFactor_seqaijcusparse_cusparse - Creates the (square, n x n) factor matrix B for a
   SEQAIJCUSPARSE matrix A.  The symbolic-factorization function pointers installed on B
   dispatch to the CUSPARSE variants unless A is bound to the CPU, in which case the plain
   SeqAIJ variants are installed instead.  Only the LU/ILU/ILUDT and Cholesky/ICC factor
   types are supported; any other ftype raises PETSC_ERR_SUP.
*/
115 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
116 {
117   PetscInt       n = A->rmap->n;
118 
119   PetscFunctionBegin;
120   PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B));
121   PetscCall(MatSetSizes(*B,n,n,n,n));
122   (*B)->factortype = ftype;
123   PetscCall(MatSetType(*B,MATSEQAIJCUSPARSE));
124 
    /* propagate A's CPU binding to the factor so it also stays on the CPU when requested */
125   if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B,PETSC_TRUE));
126   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
127     PetscCall(MatSetBlockSizesFromMats(*B,A,A));
128     if (!A->boundtocpu) {
129       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
130       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
131     } else {
132       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
133       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
134     }
    /* preferred orderings: nested dissection for full LU, natural ordering for the incomplete variants */
135     PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]));
136     PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]));
137     PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
138   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
139     if (!A->boundtocpu) {
140       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
141       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
142     } else {
143       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
144       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
145     }
146     PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
147     PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]));
148   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
149 
    /* MAT_SKIP_ALLOCATION: storage is presumably allocated later by the symbolic factorization */
150   PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL));
151   (*B)->canuseordering = PETSC_TRUE;
152   PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse));
153   PetscFunctionReturn(0);
154 }
155 
/*
   MatCUSPARSESetFormat_SeqAIJCUSPARSE - Type-specific implementation of MatCUSPARSESetFormat().
   A sequential matrix has a single storage format, so MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL
   both set the same field; any other operation raises PETSC_ERR_SUP.
*/
156 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
157 {
158   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
159 
160   PetscFunctionBegin;
161   switch (op) {
162   case MAT_CUSPARSE_MULT:
163     cusparsestruct->format = format;
164     break;
165   case MAT_CUSPARSE_ALL:
166     cusparsestruct->format = format;
167     break;
168   default:
169     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
170   }
171   PetscFunctionReturn(0);
172 }
173 
174 /*@
175    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
176    operation. Only the MatMult operation can use different GPU storage formats
177    for MPIAIJCUSPARSE matrices.
178    Not Collective
179 
180    Input Parameters:
181 +  A - Matrix of type SEQAIJCUSPARSE
182 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
183 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
184 
185    Output Parameter:
186 
187    Level: intermediate
188 
189 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
190 @*/
/* Public entry point; dispatches to the type-specific implementation (if any) via PetscTryMethod(),
   so it is a no-op for matrix types that do not compose "MatCUSPARSESetFormat_C" */
191 PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
192 {
193   PetscFunctionBegin;
194   PetscValidHeaderSpecific(A, MAT_CLASSID,1);
195   PetscCall(PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format)));
196   PetscFunctionReturn(0);
197 }
198 
/* Type-specific implementation of MatCUSPARSESetUseCPUSolve(): records whether MatSolve
   should run on the CPU (the flag is consulted in MatLUFactorNumeric_SeqAIJCUSPARSE) */
199 PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A,PetscBool use_cpu)
200 {
201   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
202 
203   PetscFunctionBegin;
204   cusparsestruct->use_cpu_solve = use_cpu;
205   PetscFunctionReturn(0);
206 }
207 
208 /*@
209    MatCUSPARSESetUseCPUSolve - Sets use CPU MatSolve.
210 
211    Input Parameters:
212 +  A - Matrix of type SEQAIJCUSPARSE
213 -  use_cpu - set flag for using the built-in CPU MatSolve
214 
215    Output Parameter:
216 
217    Notes:
218    The cuSparse LU solver currently computes the factors with the built-in CPU method
219    and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
220    This method is used to specify whether the solve is done on the CPU or GPU (GPU is the default).
221 
222    Level: intermediate
223 
224 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
225 @*/
/* Public entry point; dispatches via PetscTryMethod() so it is a no-op for matrix types
   that do not compose "MatCUSPARSESetUseCPUSolve_C" */
226 PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A,PetscBool use_cpu)
227 {
228   PetscFunctionBegin;
229   PetscValidHeaderSpecific(A, MAT_CLASSID,1);
230   PetscCall(PetscTryMethod(A,"MatCUSPARSESetUseCPUSolve_C",(Mat,PetscBool),(A,use_cpu)));
231   PetscFunctionReturn(0);
232 }
233 
/*
   MatSetOption_SeqAIJCUSPARSE - Handles MAT_FORM_EXPLICIT_TRANSPOSE specially (the cached GPU
   transpose must be invalidated when the option is turned off) and forwards every other
   option to the host SeqAIJ implementation.
*/
234 PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
235 {
236   PetscFunctionBegin;
237   switch (op) {
238     case MAT_FORM_EXPLICIT_TRANSPOSE:
239       /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
240       if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
241       A->form_explicit_transpose = flg;
242       break;
243     default:
244       PetscCall(MatSetOption_SeqAIJ(A,op,flg));
245       break;
246   }
247   PetscFunctionReturn(0);
248 }
249 
250 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
251 
/*
   MatLUFactorNumeric_SeqAIJCUSPARSE - Numeric LU factorization.  The factorization itself is
   computed on the host by MatLUFactorNumeric_SeqAIJ(); this wrapper then selects the MatSolve
   implementation based on the row/column orderings and the use_cpu_solve flag, and (for GPU
   solves) runs the triangular-solve analysis and copies the factors to the GPU.
*/
252 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
253 {
254   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
255   IS             isrow = b->row,iscol = b->col;
256   PetscBool      row_identity,col_identity;
257   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)B->spptr;
258 
259   PetscFunctionBegin;
    /* ensure A's latest values are on the host before factoring there */
260   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
261   PetscCall(MatLUFactorNumeric_SeqAIJ(B,A,info));
    /* the factors were just written on the host, so that is now the only valid copy */
262   B->offloadmask = PETSC_OFFLOAD_CPU;
263   /* determine which version of MatSolve needs to be used. */
264   PetscCall(ISIdentity(isrow,&row_identity));
265   PetscCall(ISIdentity(iscol,&col_identity));
266   if (row_identity && col_identity) {
    /* natural ordering: the GPU solve can skip the permutation steps; leave the CPU default in
       place when use_cpu_solve is set */
267     if (!cusparsestruct->use_cpu_solve) {
268       B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
269       B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
270     }
271     B->ops->matsolve = NULL;
272     B->ops->matsolvetranspose = NULL;
273   } else {
274     if (!cusparsestruct->use_cpu_solve) {
275       B->ops->solve = MatSolve_SeqAIJCUSPARSE;
276       B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
277     }
278     B->ops->matsolve = NULL;
279     B->ops->matsolvetranspose = NULL;
280   }
281 
282   /* get the triangular factors */
283   if (!cusparsestruct->use_cpu_solve) {
284     PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
285   }
286   PetscFunctionReturn(0);
287 }
288 
/*
   MatSetFromOptions_SeqAIJCUSPARSE - Processes the -mat_cusparse_* command-line options:
   storage format (for SpMV only, or for all operations), CPU-vs-GPU solve, and -- for
   CUDA >= 11 -- the cuSPARSE SpMV/SpMM/csr2csc algorithm choices.  Options only apply to
   unfactored matrices.  Because PetscOptionsEnum() assigns values by position in the string
   arrays at the top of this file, each enum option is followed by a PetscCheckFalse() that
   verifies the arrays are still in sync with the cuSPARSE headers.
*/
289 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
290 {
291   PetscErrorCode           ierr;
292   MatCUSPARSEStorageFormat format;
293   PetscBool                flg;
294   Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
295 
296   PetscFunctionBegin;
297   PetscCall(PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options"));
298   if (A->factortype == MAT_FACTOR_NONE) {
299     ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
300                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);PetscCall(ierr);
301     if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format));
302 
303     ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
304                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);PetscCall(ierr);
305     if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format));
306     PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve","Use CPU (I)LU solve","MatCUSPARSESetUseCPUSolve",cusparsestruct->use_cpu_solve,&cusparsestruct->use_cpu_solve,&flg));
307     if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A,cusparsestruct->use_cpu_solve));
308 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
309     ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
310                             "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);PetscCall(ierr);
311     /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
312 #if PETSC_PKG_CUDA_VERSION_GE(11,2,0)
    /* CUDA 11.2 renamed CUSPARSE_CSRMV_ALG1 to CUSPARSE_SPMV_CSR_ALG1; both must equal 2 here */
313     PetscCheckFalse(flg && CUSPARSE_SPMV_CSR_ALG1 != 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
314 #else
315     PetscCheckFalse(flg && CUSPARSE_CSRMV_ALG1 != 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
316 #endif
317     ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
318                             "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);PetscCall(ierr);
319     PetscCheckFalse(flg && CUSPARSE_SPMM_CSR_ALG1 != 4,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
320 
321     ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
322                             "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);PetscCall(ierr);
323     PetscCheckFalse(flg && CUSPARSE_CSR2CSC_ALG1 != 1,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
324    #endif
325   }
326   PetscCall(PetscOptionsTail());
327   PetscFunctionReturn(0);
328 }
329 
/* Symbolic ILU factorization: resets any stale GPU triangular-factor data on B, delegates
   the symbolic work to the host SeqAIJ implementation, then installs the CUSPARSE numeric
   factorization routine */
330 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
331 {
332   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
333 
334   PetscFunctionBegin;
335   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
336   PetscCall(MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
337   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
338   PetscFunctionReturn(0);
339 }
340 
/* Symbolic LU factorization: resets any stale GPU triangular-factor data on B, delegates
   the symbolic work to the host SeqAIJ implementation, then installs the CUSPARSE numeric
   factorization routine */
341 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
342 {
343   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
344 
345   PetscFunctionBegin;
346   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
347   PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
348   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
349   PetscFunctionReturn(0);
350 }
351 
/* Symbolic ICC factorization: resets any stale GPU triangular-factor data on B, delegates
   the symbolic work to the host SeqAIJ implementation, then installs the CUSPARSE numeric
   Cholesky routine */
352 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
353 {
354   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
355 
356   PetscFunctionBegin;
357   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
358   PetscCall(MatICCFactorSymbolic_SeqAIJ(B,A,perm,info));
359   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
360   PetscFunctionReturn(0);
361 }
362 
/* Symbolic Cholesky factorization: resets any stale GPU triangular-factor data on B,
   delegates the symbolic work to the host SeqAIJ implementation, then installs the
   CUSPARSE numeric Cholesky routine */
363 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
364 {
365   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
366 
367   PetscFunctionBegin;
368   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
369   PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info));
370   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
371   PetscFunctionReturn(0);
372 }
373 
/*
   MatSeqAIJCUSPARSEBuildILULowerTriMatrix - Builds (or refreshes) the unit-lower-triangular
   factor L of an (I)LU factorization on the GPU as a CUSPARSE CSR triangular-solve structure.
   The strictly-lower entries are read from the host SeqAIJ arrays a->i/a->j/a->a of the
   factored matrix A, and an explicit 1.0 diagonal entry is appended to each row (the host
   format keeps the unit diagonal implicit).  On the first call the full structure (pattern,
   descriptor, solve analysis) is created in pinned host buffers and uploaded; on subsequent
   calls only the numerical values are refreshed through the cached AA_h buffer.
*/
374 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
375 {
376   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
377   PetscInt                          n = A->rmap->n;
378   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
379   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
380   const PetscInt                    *ai = a->i,*aj = a->j,*vi;
381   const MatScalar                   *aa = a->a,*v;
382   PetscInt                          *AiLo, *AjLo;
383   PetscInt                          i,nz, nzLower, offset, rowOffset;
384 
385   PetscFunctionBegin;
386   if (!n) PetscFunctionReturn(0);
    /* only (re)build when the host copy is the authoritative one */
387   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
388     try {
389       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* ai[n]-ai[1] strictly-lower entries (row 0 of L has none) plus n unit-diagonal entries;
         assumes ai/aj hold the strictly-lower part of the combined factor -- see the SeqAIJ
         factored-matrix layout */
390       nzLower=n+ai[n]-ai[1];
391       if (!loTriFactor) {
392         PetscScalar                       *AALo;
393 
        /* pinned host buffers for fast host-to-device transfer */
394         PetscCallCUDA(cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar)));
395 
396         /* Allocate Space for the lower triangular matrix */
397         PetscCallCUDA(cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt)));
398         PetscCallCUDA(cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt)));
399 
400         /* Fill the lower triangular matrix */
        /* row 0 contains only the unit diagonal */
401         AiLo[0]  = (PetscInt) 0;
402         AiLo[n]  = nzLower;
403         AjLo[0]  = (PetscInt) 0;
404         AALo[0]  = (MatScalar) 1.0;
405         v        = aa;
406         vi       = aj;
407         offset   = 1;
408         rowOffset= 1;
409         for (i=1; i<n; i++) {
410           nz = ai[i+1] - ai[i];
411           /* additional 1 for the term on the diagonal */
412           AiLo[i]    = rowOffset;
413           rowOffset += nz+1;
414 
415           PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz));
416           PetscCall(PetscArraycpy(&(AALo[offset]), v, nz));
417 
          /* append the unit diagonal entry after the strictly-lower entries of row i */
418           offset      += nz;
419           AjLo[offset] = (PetscInt) i;
420           AALo[offset] = (MatScalar) 1.0;
421           offset      += 1;
422 
423           v  += nz;
424           vi += nz;
425         }
426 
427         /* allocate space for the triangular factor information */
428         PetscCall(PetscNew(&loTriFactor));
429         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
430         /* Create the matrix description */
431         PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
432         PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
433        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
434         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
435        #else
436         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
437        #endif
438         PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
439         PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));
440 
441         /* set the operation */
442         loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
443 
444         /* set the matrix */
445         loTriFactor->csrMat = new CsrMatrix;
446         loTriFactor->csrMat->num_rows = n;
447         loTriFactor->csrMat->num_cols = n;
448         loTriFactor->csrMat->num_entries = nzLower;
449 
        /* thrust device vectors; assign() performs the host-to-device copy */
450         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
451         loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);
452 
453         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
454         loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);
455 
456         loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
457         loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
458 
459         /* Create the solve analysis information */
460         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
461         PetscCallCUSPARSE(cusparse_create_analysis_info(&loTriFactor->solveInfo));
462       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* newer cuSPARSE requires an explicit user-provided work buffer for the analysis/solve */
463         PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
464                                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
465                                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
466                                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
467                                                &loTriFactor->solveBufferSize));
468         PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize));
469       #endif
470 
471         /* perform the solve analysis */
472         PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
473                                          loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
474                                          loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
475                                          loTriFactor->csrMat->column_indices->data().get(),
476                                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
477                                          loTriFactor->solveInfo,
478                                          loTriFactor->solvePolicy, loTriFactor->solveBuffer));
479                                          #else
480                                          loTriFactor->solveInfo));
481                                          #endif
482         PetscCallCUDA(WaitForCUDA());
483         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));
484 
485         /* assign the pointer */
486         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        /* keep the pinned values buffer for fast numeric-only refreshes; index buffers are freed */
487         loTriFactor->AA_h = AALo;
488         PetscCallCUDA(cudaFreeHost(AiLo));
489         PetscCallCUDA(cudaFreeHost(AjLo));
490         PetscCall(PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar)));
491       } else { /* update values only */
492         if (!loTriFactor->AA_h) {
493           PetscCallCUDA(cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar)));
494         }
495         /* Fill the lower triangular matrix */
496         loTriFactor->AA_h[0]  = 1.0;
497         v        = aa;
498         vi       = aj;
499         offset   = 1;
500         for (i=1; i<n; i++) {
501           nz = ai[i+1] - ai[i];
502           PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz));
503           offset      += nz;
504           loTriFactor->AA_h[offset] = 1.0;
505           offset      += 1;
506           v  += nz;
507         }
508         loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
509         PetscCall(PetscLogCpuToGpu(nzLower*sizeof(PetscScalar)));
510       }
    /* NOTE(review): thrust/CUDA failures typically throw std::exception subclasses, not char*;
       this handler may never match such exceptions -- confirm intended exception type */
511     } catch(char *ex) {
512       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
513     }
514   }
515   PetscFunctionReturn(0);
516 }
517 
518 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
519 {
520   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
521   PetscInt                          n = A->rmap->n;
522   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
523   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
524   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
525   const MatScalar                   *aa = a->a,*v;
526   PetscInt                          *AiUp, *AjUp;
527   PetscInt                          i,nz, nzUpper, offset;
528 
529   PetscFunctionBegin;
530   if (!n) PetscFunctionReturn(0);
531   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
532     try {
533       /* next, figure out the number of nonzeros in the upper triangular matrix. */
534       nzUpper = adiag[0]-adiag[n];
535       if (!upTriFactor) {
536         PetscScalar *AAUp;
537 
538         PetscCallCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar)));
539 
540         /* Allocate Space for the upper triangular matrix */
541         PetscCallCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt)));
542         PetscCallCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt)));
543 
544         /* Fill the upper triangular matrix */
545         AiUp[0]=(PetscInt) 0;
546         AiUp[n]=nzUpper;
547         offset = nzUpper;
548         for (i=n-1; i>=0; i--) {
549           v  = aa + adiag[i+1] + 1;
550           vi = aj + adiag[i+1] + 1;
551 
552           /* number of elements NOT on the diagonal */
553           nz = adiag[i] - adiag[i+1]-1;
554 
555           /* decrement the offset */
556           offset -= (nz+1);
557 
558           /* first, set the diagonal elements */
559           AjUp[offset] = (PetscInt) i;
560           AAUp[offset] = (MatScalar)1./v[nz];
561           AiUp[i]      = AiUp[i+1] - (nz+1);
562 
563           PetscCall(PetscArraycpy(&(AjUp[offset+1]), vi, nz));
564           PetscCall(PetscArraycpy(&(AAUp[offset+1]), v, nz));
565         }
566 
567         /* allocate space for the triangular factor information */
568         PetscCall(PetscNew(&upTriFactor));
569         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
570 
571         /* Create the matrix description */
572         PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
573         PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
574        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
575         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
576        #else
577         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
578        #endif
579         PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
580         PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
581 
582         /* set the operation */
583         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
584 
585         /* set the matrix */
586         upTriFactor->csrMat = new CsrMatrix;
587         upTriFactor->csrMat->num_rows = n;
588         upTriFactor->csrMat->num_cols = n;
589         upTriFactor->csrMat->num_entries = nzUpper;
590 
591         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
592         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
593 
594         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
595         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
596 
597         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
598         upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
599 
600         /* Create the solve analysis information */
601         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
602         PetscCallCUSPARSE(cusparse_create_analysis_info(&upTriFactor->solveInfo));
603       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
604         PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
605                                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
606                                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
607                                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
608                                                &upTriFactor->solveBufferSize));
609         PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize));
610       #endif
611 
612         /* perform the solve analysis */
613         PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
614                                          upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
615                                          upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
616                                          upTriFactor->csrMat->column_indices->data().get(),
617                                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
618                                          upTriFactor->solveInfo,
619                                          upTriFactor->solvePolicy, upTriFactor->solveBuffer));
620                                          #else
621                                          upTriFactor->solveInfo));
622                                          #endif
623         PetscCallCUDA(WaitForCUDA());
624         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));
625 
626         /* assign the pointer */
627         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
628         upTriFactor->AA_h = AAUp;
629         PetscCallCUDA(cudaFreeHost(AiUp));
630         PetscCallCUDA(cudaFreeHost(AjUp));
631         PetscCall(PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar)));
632       } else {
633         if (!upTriFactor->AA_h) {
634           PetscCallCUDA(cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar)));
635         }
636         /* Fill the upper triangular matrix */
637         offset = nzUpper;
638         for (i=n-1; i>=0; i--) {
639           v  = aa + adiag[i+1] + 1;
640 
641           /* number of elements NOT on the diagonal */
642           nz = adiag[i] - adiag[i+1]-1;
643 
644           /* decrement the offset */
645           offset -= (nz+1);
646 
647           /* first, set the diagonal elements */
648           upTriFactor->AA_h[offset] = 1./v[nz];
649           PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz));
650         }
651         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
652         PetscCall(PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar)));
653       }
654     } catch(char *ex) {
655       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
656     }
657   }
658   PetscFunctionReturn(0);
659 }
660 
/*
  Build both ILU triangular factors on the GPU and cache the row/column
  permutation index arrays (uploaded once) used by the permuted triangular
  solves. Leaves A marked as valid on both host and device.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a       = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(factors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  /* push both triangular factors of the ILU factorization to the device */
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));

  /* scratch vector shared by the two triangular solves */
  if (!factors->workVector) factors->workVector = new THRUSTARRAY(n);
  factors->nnz = a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* row permutation, applied before the lower triangular solve; uploaded only once */
  PetscCall(ISIdentity(isrow,&row_identity));
  if (!row_identity && !factors->rpermIndices) {
    const PetscInt *r;

    PetscCall(ISGetIndices(isrow,&r));
    factors->rpermIndices = new THRUSTINTARRAY(n);
    factors->rpermIndices->assign(r,r+n);
    PetscCall(ISRestoreIndices(isrow,&r));
    PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt)));
  }

  /* column permutation, applied after the upper triangular solve; uploaded only once */
  PetscCall(ISIdentity(iscol,&col_identity));
  if (!col_identity && !factors->cpermIndices) {
    const PetscInt *c;

    PetscCall(ISGetIndices(iscol,&c));
    factors->cpermIndices = new THRUSTINTARRAY(n);
    factors->cpermIndices->assign(c,c+n);
    PetscCall(ISRestoreIndices(iscol,&c));
    PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}
703 
/*
  Build the device-side triangular factors for an ICC factorization.

  Only the upper triangle U is stored in CSR form: it serves both as U
  (upper fill, non-transpose solve) and, via a TRANSPOSE solve operation
  and separately scaled values (AALo), as the lower factor. On first call
  the CSR structure and cuSPARSE analysis are created; on subsequent calls
  (same nonzero pattern) only the numerical values are re-uploaded.

  No-op unless the matrix data is fresh on the CPU.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;     /* host staging buffer for the values of U */
  PetscScalar                       *AALo;     /* host staging buffer for the values of the lower factor */
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is simultaneously cast to Mat_SeqAIJ (above) and
     Mat_SeqSBAIJ here; this relies on the i/j/a members lining up between the
     two structs for a factored SBAIJ-style storage — confirm against the
     struct layouts before touching either cast. */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host buffers so the later thrust assign() uploads are fast */
      PetscCallCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar)));
      PetscCallCUDA(cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar)));
      if (!upTriFactor && !loTriFactor) {
        /* first call: build structure (AiUp/AjUp) as well as values */
        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          /* row i is stored as [ diag, off-diagonal entries ]; v[nz] is the
             stored diagonal, inverted here for both factors */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz));
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            /* off-diagonals: negate for U, and additionally scale by the
               inverse diagonal for the lower factor */
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
       #else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
       #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        PetscCallCUSPARSE(cusparse_create_analysis_info(&upTriFactor->solveInfo));
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* csrsv2 interface (CUDA >= 9) needs an explicit work buffer */
        PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                               &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize));
      #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                         upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                         upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                         upTriFactor->csrMat->column_indices->data().get(),
                                         #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                         upTriFactor->solveInfo,
                                         upTriFactor->solvePolicy, upTriFactor->solveBuffer));
                                         #else
                                         upTriFactor->solveInfo));
                                         #endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
       #else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
       #endif
        /* the lower factor reuses the UPPER-fill structure and is applied with
           a TRANSPOSE solve, so fill mode stays UPPER here */
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix: same sparsity pattern as U (shared AiUp/AjUp), different values */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        PetscCallCUSPARSE(cusparse_create_analysis_info(&loTriFactor->solveInfo));
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                               &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize));
      #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                         loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                         loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                         loTriFactor->csrMat->column_indices->data().get(),
                                         #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                         loTriFactor->solveInfo,
                                         loTriFactor->solvePolicy, loTriFactor->solveBuffer));
                                         #else
                                         loTriFactor->solveInfo));
                                         #endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        PetscCall(PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar))));
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
      } else {
        /* later calls: the pattern is unchanged, so only recompute the values
           and re-upload them into the already-built factors */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        PetscCheck(upTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        PetscCheck(loTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        PetscCall(PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar)));
      }
      PetscCallCUDA(cudaFreeHost(AAUp));
      PetscCallCUDA(cudaFreeHost(AALo));
    } catch(char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
920 
/*
  Build the ICC triangular factors on the GPU and cache the row/column
  permutation index arrays used by the permuted solves.

  Fix: the permutation arrays are now allocated only once. This routine is
  re-invoked on every numeric factorization (see
  MatCholeskyFactorNumeric_SeqAIJCUSPARSE below), and the unguarded
  `new THRUSTINTARRAY(n)` leaked the previous arrays on each refactorization.
  The guards mirror MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU above; the
  factors (and these arrays) are destroyed whenever a new symbolic
  factorization is performed, so a stale permutation cannot persist.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  /* ICC stores only the upper triangle; count both triangles minus the doubly counted diagonal */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices: upload the permutation and its inverse once */
  PetscCall(ISIdentity(ip,&perm_identity));
  if (!perm_identity && !cusparseTriFactors->rpermIndices && !cusparseTriFactors->cpermIndices) {
    IS             iip;
    const PetscInt *irip,*rip;

    PetscCall(ISInvertPermutation(ip,PETSC_DECIDE,&iip));
    PetscCall(ISGetIndices(iip,&irip));
    PetscCall(ISGetIndices(ip,&rip));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    PetscCall(ISRestoreIndices(iip,&irip));
    PetscCall(ISDestroy(&iip));
    PetscCall(ISRestoreIndices(ip,&rip));
    PetscCall(PetscLogCpuToGpu(2.*n*sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}
957 
/*
  Numeric Cholesky factorization: compute the factorization on the CPU with
  the SeqAIJ kernel, select the (natural-ordering or permuted) GPU solve
  kernels, then push the triangular factors to the device.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      natural;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B,A,info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* determine which version of MatSolve needs to be used: an identity row
     permutation allows the cheaper natural-ordering solve path */
  PetscCall(ISIdentity(ip,&natural));
  if (natural) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors onto the GPU */
  PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(0);
}
986 
987 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
988 {
989   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
990   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
991   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
992   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
993   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
994   cusparseIndexBase_t               indexBase;
995   cusparseMatrixType_t              matrixType;
996   cusparseFillMode_t                fillMode;
997   cusparseDiagType_t                diagType;
998 
999   PetscFunctionBegin;
1000   /* allocate space for the transpose of the lower triangular factor */
1001   PetscCall(PetscNew(&loTriFactorT));
1002   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1003 
1004   /* set the matrix descriptors of the lower triangular factor */
1005   matrixType = cusparseGetMatType(loTriFactor->descr);
1006   indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
1007   fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1008     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1009   diagType = cusparseGetMatDiagType(loTriFactor->descr);
1010 
1011   /* Create the matrix description */
1012   PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
1013   PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
1014   PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
1015   PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
1016   PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));
1017 
1018   /* set the operation */
1019   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1020 
1021   /* allocate GPU space for the CSC of the lower triangular factor*/
1022   loTriFactorT->csrMat = new CsrMatrix;
1023   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
1024   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
1025   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
1026   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
1027   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1028   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
1029 
1030   /* compute the transpose of the lower triangular factor, i.e. the CSC */
1031 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1032   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1033                                                loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1034                                                loTriFactor->csrMat->values->data().get(),
1035                                                loTriFactor->csrMat->row_offsets->data().get(),
1036                                                loTriFactor->csrMat->column_indices->data().get(),
1037                                                loTriFactorT->csrMat->values->data().get(),
1038                                                loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1039                                                CUSPARSE_ACTION_NUMERIC,indexBase,
1040                                                CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
1041   PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize));
1042 #endif
1043 
1044   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
1045   PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1046                                   loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1047                                   loTriFactor->csrMat->values->data().get(),
1048                                   loTriFactor->csrMat->row_offsets->data().get(),
1049                                   loTriFactor->csrMat->column_indices->data().get(),
1050                                   loTriFactorT->csrMat->values->data().get(),
1051                                   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1052                                   loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1053                                   CUSPARSE_ACTION_NUMERIC, indexBase,
1054                                   CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer));
1055                                   #else
1056                                   loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1057                                   CUSPARSE_ACTION_NUMERIC, indexBase));
1058                                   #endif
1059   PetscCallCUDA(WaitForCUDA());
1060   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
1061 
1062   /* Create the solve analysis information */
1063   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
1064   PetscCallCUSPARSE(cusparse_create_analysis_info(&loTriFactorT->solveInfo));
1065 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1066   PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
1067                                          loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1068                                          loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1069                                          loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
1070                                          &loTriFactorT->solveBufferSize));
1071   PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize));
1072 #endif
1073 
1074   /* perform the solve analysis */
1075   PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
1076                                    loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1077                                    loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1078                                    loTriFactorT->csrMat->column_indices->data().get(),
1079                                    #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1080                                    loTriFactorT->solveInfo,
1081                                    loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1082                                    #else
1083                                    loTriFactorT->solveInfo));
1084                                    #endif
1085   PetscCallCUDA(WaitForCUDA());
1086   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));
1087 
1088   /* assign the pointer */
1089   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1090 
1091   /*********************************************/
1092   /* Now the Transpose of the Upper Tri Factor */
1093   /*********************************************/
1094 
1095   /* allocate space for the transpose of the upper triangular factor */
1096   PetscCall(PetscNew(&upTriFactorT));
1097   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1098 
1099   /* set the matrix descriptors of the upper triangular factor */
1100   matrixType = cusparseGetMatType(upTriFactor->descr);
1101   indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
1102   fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1103     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1104   diagType = cusparseGetMatDiagType(upTriFactor->descr);
1105 
1106   /* Create the matrix description */
1107   PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
1108   PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
1109   PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
1110   PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
1111   PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));
1112 
1113   /* set the operation */
1114   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1115 
1116   /* allocate GPU space for the CSC of the upper triangular factor*/
1117   upTriFactorT->csrMat = new CsrMatrix;
1118   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1119   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1120   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1121   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
1122   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1123   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1124 
1125   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1126 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1127   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
1128                                                upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1129                                                upTriFactor->csrMat->values->data().get(),
1130                                                upTriFactor->csrMat->row_offsets->data().get(),
1131                                                upTriFactor->csrMat->column_indices->data().get(),
1132                                                upTriFactorT->csrMat->values->data().get(),
1133                                                upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1134                                                CUSPARSE_ACTION_NUMERIC,indexBase,
1135                                                CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
1136   PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize));
1137 #endif
1138 
1139   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
1140   PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
1141                                   upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1142                                   upTriFactor->csrMat->values->data().get(),
1143                                   upTriFactor->csrMat->row_offsets->data().get(),
1144                                   upTriFactor->csrMat->column_indices->data().get(),
1145                                   upTriFactorT->csrMat->values->data().get(),
1146                                   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1147                                   upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1148                                   CUSPARSE_ACTION_NUMERIC, indexBase,
1149                                   CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer));
1150                                   #else
1151                                   upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1152                                  CUSPARSE_ACTION_NUMERIC, indexBase));
1153                                  #endif
1154 
1155   PetscCallCUDA(WaitForCUDA());
1156   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
1157 
1158   /* Create the solve analysis information */
1159   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
1160   PetscCallCUSPARSE(cusparse_create_analysis_info(&upTriFactorT->solveInfo));
1161   #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1162   PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
1163                                          upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1164                                          upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1165                                          upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
1166                                          &upTriFactorT->solveBufferSize));
1167   PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize));
1168   #endif
1169 
1170   /* perform the solve analysis */
  /* TODO: this solve-analysis setup (buffer-size query, allocation, analysis call) is repeated
     several times in this file; refactor it into a shared helper function */
1172   PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
1173                                    upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1174                                    upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1175                                    upTriFactorT->csrMat->column_indices->data().get(),
1176                                    #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1177                                    upTriFactorT->solveInfo,
1178                                    upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1179                                    #else
1180                                    upTriFactorT->solveInfo));
1181                                    #endif
1182 
1183   PetscCallCUDA(WaitForCUDA());
1184   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));
1185 
1186   /* assign the pointer */
1187   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1188   PetscFunctionReturn(0);
1189 }
1190 
/* Unary functor converting a PetscScalar to a PetscInt by truncating its real part.
   Used with thrust::transform to turn scalars that encode nonzero positions back
   into integer gather indices (see the csr2csc_i construction below). */
struct PetscScalarToPetscInt
{
  __host__ __device__
  PetscInt operator()(PetscScalar s)
  {
    return static_cast<PetscInt>(PetscRealPart(s));
  }
};
1199 
/*
   MatSeqAIJCUSPARSEFormExplicitTranspose - builds or refreshes an explicit transpose of A
   on the GPU, cached in cusparsestruct->matTranspose and flagged current via A->transupdated.

   CSR path: on first use, csr2csc is run on "values" that encode each nonzero's position
   (0,1,2,...) to obtain a gather permutation (cusparsestruct->csr2csc_i); afterwards every
   call only permutes the current values of A into the transpose with a single thrust::copy,
   so the expensive csr2csc is done once per nonzero pattern.

   ELL/HYB path (CUDA < 11 only): the transpose is formed by converting HYB -> CSR,
   transposing with csr2csc, and converting back CSR -> HYB.
*/
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;

  PetscFunctionBegin;
  /* make sure the latest values of A are on the GPU before forming the transpose */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  PetscCheck(matstruct,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  PetscCheckFalse(A->transupdated && !matstructT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(0); /* cached transpose is still current */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
  PetscCall(PetscLogGpuTimeBegin());
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
    /* the value-update shortcut below only applies to CSR; drop any stale transpose first */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
  }
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
    PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

    /* set alpha and beta */
    PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* allocate a CSR matrix with the transposed dimensions; contents are filled in below */
      CsrMatrix *matrixT = new CsrMatrix;
      matstructT->mat = matrixT;
      matrixT->num_rows = A->cmap->n;
      matrixT->num_cols = A->rmap->n;
      matrixT->num_entries = a->nz;
      matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values = new THRUSTARRAY(a->nz);

      /* 32-bit device copy of the host row offsets a->i, needed as csr2csc input */
      if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      #if PETSC_PKG_CUDA_VERSION_GE(11,2,1)
        stat = cusparseCreateCsr(&matstructT->matDescr,
                               matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                               matrixT->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase,cusparse_scalartype);PetscCallCUSPARSE(stat);
      #else
        /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
           see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

           I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
           it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
           when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
        */
        if (matrixT->num_entries) {
          stat = cusparseCreateCsr(&matstructT->matDescr,
                                 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                                 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                                 matrixT->values->data().get(),
                                 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I,
                                 indexBase,cusparse_scalartype);PetscCallCUSPARSE(stat);

        } else {
          matstructT->matDescr = NULL;
          matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
        }
      #endif
     #endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
   #else
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows = A->rmap->n;
      temp->num_cols = A->cmap->n;
      temp->num_entries = a->nz;
      temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle,
                              matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get());PetscCallCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      tempT->num_rows = A->rmap->n;
      tempT->num_cols = A->cmap->n;
      tempT->num_entries = a->nz;
      tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                              temp->num_cols, temp->num_entries,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get(),
                              tempT->values->data().get(),
                              tempT->column_indices->data().get(),
                              tempT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);PetscCallCUSPARSE(stat);

      /* Last, convert CSC to HYB */
      cusparseHybMat_t hybMat;
      PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
      cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
        CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                              matstructT->descr, tempT->values->data().get(),
                              tempT->row_offsets->data().get(),
                              tempT->column_indices->data().get(),
                              hybMat, 0, partition);PetscCallCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY*) tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
        delete (CsrMatrix*) tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY*) temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
        delete (CsrMatrix*) temp;
      }
     #endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
    PetscCheck(matrix,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix");
    PetscCheck(matrix->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows");
    PetscCheck(matrix->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols");
    PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values");
    PetscCheck(matrixT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT");
    PetscCheck(matrixT->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows");
    PetscCheck(matrixT->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols");
    PetscCheck(matrixT->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
    }
    if (!cusparsestruct->csr2csc_i) {
      /* first time for this pattern: transpose scalars encoding each nonzero's position
         0,1,2,... so the transposed values tell us where each entry came from */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      void   *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                           A->cmap->n, matrix->num_entries,
                                           matrix->values->data().get(),
                                           cusparsestruct->rowoffsets_gpu->data().get(),
                                           matrix->column_indices->data().get(),
                                           matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                           CUSPARSE_ACTION_NUMERIC,indexBase,
                                           cusparsestruct->csr2cscAlg, &csr2cscBufferSize);PetscCallCUSPARSE(stat);
      PetscCallCUDA(cudaMalloc(&csr2cscBuffer,csr2cscBufferSize));
     #endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                              A->cmap->n,matrix->num_entries,
                              csr2csc_a.data().get(),
                              cusparsestruct->rowoffsets_gpu->data().get(),
                              matrix->column_indices->data().get(),
                              matrixT->values->data().get(),
                             #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                              matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                              CUSPARSE_ACTION_NUMERIC,indexBase,
                              cusparsestruct->csr2cscAlg, csr2cscBuffer);PetscCallCUSPARSE(stat);
                             #else
                              matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);PetscCallCUSPARSE(stat);
                             #endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
      }

      /* convert the permuted position scalars back into an integer gather permutation */
      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      PetscCallCUDA(cudaFree(csr2cscBuffer));
     #endif
    }
    /* every call: gather the current values of A into the transpose through the cached permutation */
    PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
                                                     thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
                                                     matrixT->values->begin()));
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
1427 
1428 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
  MatSolveTranspose_SeqAIJCUSPARSE - solve A^T x = b using the cached transposed
  triangular factors, honoring the row/column permutations of the factorization.

  The transposed factor analysis is built lazily on first use. Sequence:
    x    <- b gathered through rpermIndices
    temp <- solve with upTriFactorT applied to x
    x    <- solve with loTriFactorT applied to temp
    temp <- x gathered through cpermIndices  (cannot be done in place)
    x    <- temp
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
  PetscCall(VecCUDAGetArrayRead(bb,&barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation: x = rperm(b) */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U: x -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries, /* CUDA >= 9 interface also takes nnz, a policy and a work buffer */
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                        tempGPU->data().get());PetscCallCUSPARSE(stat);
                      #endif

  /* Then, solve L: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                         xarray);PetscCallCUSPARSE(stat);
                      #endif

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb,&barray));
  PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
  PetscCall(PetscLogGpuTimeEnd());
  /* roughly one multiply and one add per nonzero of the factors */
  PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1515 
/*
  MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering - solve A^T x = b when the
  factorization used the natural ordering, so no row/column permutations are
  applied: temp = (upper factor solve of b), then x = (lower factor solve of temp).
  The transposed factor analysis is built lazily on first use.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
  PetscCall(VecCUDAGetArrayRead(bb,&barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve U: b -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries, /* CUDA >= 9 interface also takes nnz, a policy and a work buffer */
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                        tempGPU->data().get());PetscCallCUSPARSE(stat);
                      #endif

  /* Then, solve L: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                        xarray);PetscCallCUSPARSE(stat);
                      #endif

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb,&barray));
  PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
  PetscCall(PetscLogGpuTimeEnd());
  /* roughly one multiply and one add per nonzero of the factors */
  PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1584 
/*
  MatSolve_SeqAIJCUSPARSE - solve A x = b using the cached triangular factors,
  honoring the row/column permutations of the factorization ordering.

  Sequence:
    temp <- b gathered through rpermIndices
    x    <- solve with loTriFactor applied to temp
    temp <- solve with upTriFactor applied to x
    x    <- temp gathered through cpermIndices
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
  PetscCall(VecCUDAGetArrayRead(bb,&barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation: tempGPU = rperm(b) */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries, /* CUDA >= 9 interface also takes nnz, a policy and a work buffer */
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                         xarray,
                         loTriFactor->solvePolicy, loTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                         xarray);PetscCallCUSPARSE(stat);
                      #endif

  /* Then, solve U: x -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,xarray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                        tempGPU->data().get());PetscCallCUSPARSE(stat);
                      #endif

  /* Last, reorder with the column permutation: x = cperm(tempGPU) */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  PetscCall(VecCUDARestoreArrayRead(bb,&barray));
  PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
  PetscCall(PetscLogGpuTimeEnd());
  /* roughly one multiply and one add per nonzero of the factors */
  PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1659 
/*
  MatSolve_SeqAIJCUSPARSE_NaturalOrdering - solve A x = b when the factorization
  used the natural ordering, so no row/column permutations are applied:
  temp = (lower factor solve of b), then x = (upper factor solve of temp).
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
  PetscCall(VecCUDAGetArrayRead(bb,&barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve L: b -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries, /* CUDA >= 9 interface also takes nnz, a policy and a work buffer */
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        loTriFactor->solvePolicy,loTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                        tempGPU->data().get());PetscCallCUSPARSE(stat);
                      #endif

  /* Next, solve U: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                        xarray);PetscCallCUSPARSE(stat);
                      #endif

  PetscCall(VecCUDARestoreArrayRead(bb,&barray));
  PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
  PetscCall(PetscLogGpuTimeEnd());
  /* roughly one multiply and one add per nonzero of the factors */
  PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1720 
/*
   MatSeqAIJCUSPARSECopyFromGPU - copies the matrix values from the GPU back to the host CSR array

   If the values live only on the GPU (PETSC_OFFLOAD_GPU), download them and mark the matrix
   as valid on both sides; otherwise this is a no-op.  Only the values are moved: the
   nonzero structure on the host is assumed to match the device copy (same nonzerostate).
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    CsrMatrix *matrix;

    /* cusp->mat->mat is a CsrMatrix only for the CSR storage format; for ELL/HYB it points
       to a cusparseHybMat_t and the cast (and the memcpy below) would read garbage */
    PetscCheckFalse(cusp->format != MAT_CUSPARSE_CSR,PETSC_COMM_SELF,PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
    matrix = (CsrMatrix*)cusp->mat->mat;

    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0));
    PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    PetscCallCUDA(WaitForCUDA()); /* keep the GPU timing log accurate */
    PetscCall(PetscLogGpuToCpu(a->nz*sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
1739 
/* MatSeqAIJGetArray() implementation: make sure the host copy of the values is
   current, then expose the raw CSR value array for read/write access */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  /* the up-to-date values may live only on the GPU; bring them back first */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = aij->a;
  PetscFunctionReturn(0);
}
1747 
/* MatSeqAIJRestoreArray() implementation: the caller may have modified the host
   values, so the device copy is now stale */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU; /* invalidate the GPU copy */
  PetscFunctionReturn(0);
}
1755 
/* MatSeqAIJGetArrayRead() implementation: download the current values (if they are
   GPU-only) and hand out the host CSR value array for read-only access */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = aij->a;
  PetscFunctionReturn(0);
}
1763 
/* MatSeqAIJRestoreArrayRead() implementation: read-only access leaves the offload
   mask untouched, only the caller's pointer is cleared */
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(0);
}
1770 
/* MatSeqAIJGetArrayWrite() implementation: write-only access, so there is no need
   to copy the (about to be overwritten) values back from the GPU */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  *array = aij->a;
  PetscFunctionReturn(0);
}
1777 
/* MatSeqAIJRestoreArrayWrite() implementation: host values were (re)written, so the
   device copy is stale */
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU; /* invalidate the GPU copy */
  PetscFunctionReturn(0);
}
1785 
/*
   MatSeqAIJCUSPARSECopyToGPU - brings the host CSR data of A up to date on the GPU.

   If the nonzero pattern is unchanged since the last upload (same nonzerostate) and CSR
   storage is used, only the numerical values are copied; otherwise the whole cuSPARSE
   structure is destroyed and rebuilt.  On success the offload mask becomes
   PETSC_OFFLOAD_BOTH, unless the host had no value array (structure-only copy).
*/
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE; /* set to FALSE below when only the structure (no values) is uploaded */

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu,PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      PetscCheckFalse(a->nz && !a->a,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0));
      matrix->values->assign(a->a, a->a+a->nz);
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0));
      /* values changed: cached transpose values are stale (its structure is still valid, hence PETSC_FALSE) */
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
    } else {
      PetscInt nnz;
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0));
      /* nonzero pattern changed (or non-CSR format): discard all device data and rebuild */
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        if (a->compressedrow.use) {
          /* compressed-row storage: only rows with nonzeros are stored; ridx maps them back to global row numbers */
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheckFalse(!ii,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; } /* no host values: upload structure only */
        else nnz = a->nz;
        PetscCheckFalse(nnz && !a->j,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        /* device-resident copies of the scalar constants 1 and 0: the handle is put in
           CUSPARSE_POINTER_MODE_DEVICE below, so alpha/beta must be device pointers */
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                    mat->num_rows, mat->num_cols, mat->num_entries,
                                    mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                    mat->values->data().get(),
                                    CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                    CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);PetscCallCUSPARSE(stat);
          }
         #endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
         #else
          /* temporary CSR copy on the device, converted to HYB/ELL below and then freed */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
              matstruct->descr, mat->values->data().get(),
              mat->row_offsets->data().get(),
              mat->column_indices->data().get(),
              hybMat, 0, partition);PetscCallCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          /* the intermediate CSR device arrays are no longer needed */
          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
         #endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        PetscCall(PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0));
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
1940 
/* thrust functor over a zipped (source, destination) tuple: accumulate the first
   entry into the second, i.e. dst += src */
struct VecCUDAPlusEquals
{
  template <typename T>
  __host__ __device__
  void operator()(T t)
  {
    thrust::get<1>(t) = thrust::get<0>(t) + thrust::get<1>(t);
  }
};
1950 
/* thrust functor over a zipped (source, destination) tuple: copy the first entry
   into the second, i.e. dst = src */
struct VecCUDAEquals
{
  template <typename T>
  __host__ __device__
  void operator()(T t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
1960 
/* thrust functor over a zipped tuple: copy in the opposite direction of
   VecCUDAEquals, i.e. first entry = second entry */
struct VecCUDAEqualsReverse
{
  template <typename T>
  __host__ __device__
  void operator()(T t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};
1970 
/* Scratch data attached to C->product for cuSPARSE matrix-matrix products; created in the
   symbolic phases, consumed by the numeric phases, freed by MatDestroy_MatMatCusparse() */
struct MatMatCusparse {
  PetscBool             cisdense; /* caller's C was a host MATSEQDENSE: convert the GPU result back at the end of the numeric phase */
  PetscScalar           *Bt; /* device buffer holding an explicit B^T (CUDA < 11 csrmm cannot transpose B) */
  Mat                   X; /* dense intermediate product used by PtAP/RARt */
  PetscBool             reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops; /* flop count logged by the numeric sparse-sparse product */
  CsrMatrix             *Bcsr; /* when set, the (uncompressed) CSR copy of B used in place of B's own storage */

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr; /* generic sparse descriptor matching Bcsr */
  PetscBool             initialized;   /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr; /* dense descriptor for B (SpMM path) */
  cusparseDnMatDescr_t  matCDescr; /* dense descriptor for C or X (SpMM path) */
  PetscInt              Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  void                  *dBuffer4; /* extra work buffers used by the SpGEMMreuse API */
  void                  *dBuffer5;
 #endif
  size_t                mmBufferSize; /* size of mmBuffer currently allocated */
  void                  *mmBuffer; /* SpMM/SpGEMM work buffer */
  void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc; /* persistent SpGEMM descriptor reused across numeric calls */
#endif
};
1995 
/* Destroy callback installed on C->product->destroy: releases every device buffer,
   descriptor and intermediate matrix owned by the MatMatCusparse scratch data */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mm = (MatMatCusparse *)data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(mm->Bt)); /* cudaFree() accepts NULL, no guard needed */
  delete mm->Bcsr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* descriptors and buffers are created lazily, so each one is guarded */
  if (mm->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mm->matSpBDescr));
  if (mm->matBDescr)   PetscCallCUSPARSE(cusparseDestroyDnMat(mm->matBDescr));
  if (mm->matCDescr)   PetscCallCUSPARSE(cusparseDestroyDnMat(mm->matCDescr));
  if (mm->spgemmDesc)  PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mm->spgemmDesc));
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  if (mm->dBuffer4)  PetscCallCUDA(cudaFree(mm->dBuffer4));
  if (mm->dBuffer5)  PetscCallCUDA(cudaFree(mm->dBuffer5));
 #endif
  if (mm->mmBuffer)  PetscCallCUDA(cudaFree(mm->mmBuffer));
  if (mm->mmBuffer2) PetscCallCUDA(cudaFree(mm->mmBuffer2));
 #endif
  PetscCall(MatDestroy(&mm->X));
  PetscCall(PetscFree(data));
  PetscFunctionReturn(0);
}
2019 
2020 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
2021 
/*
  Numeric phase for products C = op(A) op(B) with A of type SEQAIJCUSPARSE and B dense.
  AB, AtB and ABt are computed directly with cuSPARSE SpMM (csrmm before CUDA 11);
  PtAP and RARt first compute the dense intermediate X and then finish with a
  dense-dense product against B.  The MatMatCusparse data attached by the symbolic
  phase caches descriptors and the SpMM work buffer across calls.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A    = product->A;
  B    = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  cusp   = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* select op(A) and the dimensions m x n of the sparse-times-dense result */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP: /* PtAP: compute X = A*B here, the final multiply with B happens below */
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      /* use the explicitly stored transpose of A, applied without transposition */
      PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt: /* RARt: compute X = A*op(B) here, the final multiply with B happens below */
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  PetscCheck(mat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda));
  if (!biscuda) PetscCall(MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B));
  PetscCall(MatDenseCUDAGetArrayRead(B,&barray));

  PetscCall(MatDenseGetLDA(B,&blda));
  /* PtAP/RARt write the sparse product into the intermediate X, the others directly into C */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseCUDAGetArrayWrite(mmdata->X,&carray));
    PetscCall(MatDenseGetLDA(mmdata->X,&clda));
  } else {
    PetscCall(MatDenseCUDAGetArrayWrite(C,&carray));
    PetscCall(MatDenseGetLDA(C,&clda));
  }

  PetscCall(PetscLogGpuTimeBegin());
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    /* a changed leading dimension invalidates the cached dense descriptor */
    if (mmdata->initialized && mmdata->Blda != blda) {PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL));
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL));
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);PetscCallCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmBufferSize);PetscCallCUSPARSE(stat);
    /* grow the work buffer only if the required size increased */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      PetscCallCUDA(cudaFree(mmdata->mmBuffer));
      PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer,mmBufferSize));
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get()));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray));
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->mmBuffer);PetscCallCUSPARSE(stat);
 #else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* form B^T explicitly with cublas geam into the buffer Bt allocated in the symbolic phase */
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);PetscCallCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k    = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);PetscCallCUSPARSE(stat);
 #endif
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(n*2.0*csrmat->num_entries));
  PetscCall(MatDenseCUDARestoreArrayRead(B,&barray));
  /* PtAP/RARt: finish with the dense-dense product of B and the intermediate X
     (the PetscBool argument selects the variant used — see the private routine) */
  if (product->type == MATPRODUCT_RARt) {
    PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE));
  } else if (product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE));
  } else {
    PetscCall(MatDenseCUDARestoreArrayWrite(C,&carray));
  }
  /* user originally had a host dense C: convert the result back */
  if (mmdata->cisdense) {
    PetscCall(MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C));
  }
  if (!biscuda) {
    PetscCall(MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B));
  }
  PetscFunctionReturn(0);
}
2192 
/*
  Symbolic phase for C = op(A) op(B) with A of type SEQAIJCUSPARSE and B dense:
  sets the sizes and type of C, allocates the MatMatCusparse scratch data
  (including the B^T buffer for CUDA < 11 and the intermediate X for PtAP/RARt)
  and installs the numeric callback.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscCheckFalse(cusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  /* result dimensions m x n depend on the product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  PetscCall(MatSetSizes(C,m,n,m,n));
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense));
  PetscCall(MatSetType(C,MATSEQDENSECUDA));

  /* product data */
  PetscCall(PetscNew(&mmdata));
  mmdata->cisdense = cisdense; /* remembered so the numeric phase can convert C back */
 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    PetscCallCUDA(cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar)));
  }
 #endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X));
    PetscCall(MatSetType(mmdata->X,MATSEQDENSECUDA));
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      PetscCall(MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n));
    } else {
      PetscCall(MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n));
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2265 
/*
  Numeric phase of the sparse-sparse product C = op(A) op(B) for SEQAIJCUSPARSE matrices.
  Uses cuSPARSE SpGEMMreuse (CUDA >= 11.4), SpGEMM (CUDA 11.x) or csrgemm (CUDA < 11);
  the structure of C and the MatMatCusparse data were prepared by the symbolic phase.
  Ends with a shortened MatAssemblyEnd so C is immediately usable.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscBool                    flg;
  cusparseStatus_t             stat;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t         BmatSpDescr;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  PetscCall(PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;
  A = product->A;
  B = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    PetscCheckFalse(Ccusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix*)Cmat->mat;
    PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
    goto finalize;
  }
  if (!c->nz) goto finalize; /* empty result: nothing to compute, just finish the assembly bookkeeping */
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCheck(!B->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  PetscCheckFalse(Acusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  PetscCheckFalse(Bcusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  PetscCheckFalse(Ccusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));

  /* symmetry shortcuts: the symbolic phase may have replaced AtB/ABt by AB */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_A_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_B_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric");
  }
  /* transposes are realized via the explicitly formed transpose matrices, opA/opB stay non-transpose */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix*)Cmat->mat;
  PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
    /* CUDA 11.4+: SpGEMMreuse path, the structure was set up once in the symbolic phase */
    stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                               Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                               cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                               mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
  #else
    stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                               Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                               cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                               mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);PetscCallCUSPARSE(stat);
    /* copy the SpGEMM result into C's CSR descriptor */
    stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                               Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                               cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
  #endif
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops));
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogGpuTimeEnd());
  C->offloadmask = PETSC_OFFLOAD_GPU; /* fresh values live only on the device */
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz));
  PetscCall(PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax));
  c->reallocs         = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}
2393 
/*
  Symbolic phase of the sparse matrix-matrix product C = A*B, C = A^t*B or C = A*B^t
  for two MATSEQAIJCUSPARSE matrices.

  Responsibilities visible here:
  - validates the operand types and allocates the product data (MatMatCusparse) on C;
  - sizes and types C, builds its GPU-side CSR skeleton (Mat_SeqAIJCUSPARSEMultStruct + CsrMatrix);
  - runs the cuSPARSE symbolic SpGEMM (three code paths: cusparseSpGEMMreuse for CUDA >= 11.4,
    cusparseSpGEMM for CUDA 11.0-11.3, csrgemm for CUDA < 11.0) to obtain C's nonzero pattern;
  - mirrors the resulting row offsets / column indices back to the host SeqAIJ structure of C;
  - installs MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE as the numeric phase.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *a,*b,*c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     i,j,m,n,k;
  PetscBool                    flg;
  cusparseStatus_t             stat;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble               flops;
  PetscBool                    biscompressed,ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
  cusparseSpMatDescr_t         BmatSpDescr;
#else
  int                          cnz;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  /* both operands must be MATSEQAIJCUSPARSE; dispatch in MatProductSetFromOptions guarantees this */
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ*)A->data;
  b = (Mat_SeqAIJ*)B->data;
  /* product data: holds buffers/descriptors that must survive until the numeric phase */
  PetscCall(PetscNew(&mmdata));
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  PetscCheckFalse(Acusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  PetscCheckFalse(Bcusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");

  ptype = product->type;
  /* a symmetric operand lets us avoid forming an explicit transpose: A^t*B -> A*B, A*B^t -> A*B */
  if (A->symmetric && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
  }
  if (B->symmetric && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
  }
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  /* select result dimensions (m x n, inner dim k) and which GPU mult structs feed the SpGEMM;
     transposed operands use the explicitly formed transpose since cuSPARSE spgemm cannot transpose */
  switch (ptype) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    k = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  PetscCall(MatSetSizes(C,m,n,m,n));
  PetscCall(MatSetType(C,MATSEQAIJCUSPARSE));
  c     = (Mat_SeqAIJ*)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
  Ccsr  = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, then c will be in compressed row format */
    /* C inherits A's nonzero-row pattern: copy the row index map and mirror it on the device */
    c->compressedrow.nrows = a->compressedrow.nrows;
    PetscCall(PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex));
    PetscCall(PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows));
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
  Ccusp->mat      = Cmat;
  Ccusp->mat->mat = Ccsr;
  Ccsr->num_rows    = Ccusp->nrows;
  Ccsr->num_cols    = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
  PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  /* device-resident 1/0 scalars: the SpGEMM calls below run with CUSPARSE_POINTER_MODE_DEVICE */
  PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar)));
  PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar)));
  PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
  PetscCallCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
    /* empty product: build a valid all-zero CSR and jump straight to host finalization */
    thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
    c->nz = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix*)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix */
    /* build a shallow CsrMatrix view of B that shares values/column indices but uses
       full-length (uncompressed) row offsets, uploading b->i to the GPU once and caching it */
    CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
    Bcsr = new CsrMatrix;
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices;
    Bcsr->values         = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt)));
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr = Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
                               Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                               Bcsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  /* precompute flops count */
  if (ptype == MATPRODUCT_AB) {
    /* each entry a(i,brow) multiplies row brow of B: 2 flops per B entry touched */
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i+1];
      for (j=st; j<en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2.*(b->i[brow+1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    /* row i of A pairs with row i of B: 2*nnz(A_i)*nnz(B_i) flops */
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt anzi = a->i[i+1] - a->i[i];
      const PetscInt bnzi = b->i[i+1] - b->i[i];
      flops += (2.*anzi)*bnzi;
    }
  } else { /* TODO */
    flops = 0.;
  }

  mmdata->flops = flops;
  PetscCall(PetscLogGpuTimeBegin());

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  /* C's descriptor starts with zero nnz and NULL arrays; cuSPARSE fills in the size, we set pointers later */
  stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
                          NULL, NULL, NULL,
                          CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                          CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
  PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
 {
  /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
     We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
  */
  void*  dBuffer1 = NULL;
  void*  dBuffer2 = NULL;
  void*  dBuffer3 = NULL;
  /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
  size_t bufferSize1 = 0;
  size_t bufferSize2 = 0;
  size_t bufferSize3 = 0;
  size_t bufferSize4 = 0;
  size_t bufferSize5 = 0;

  /*----------------------------------------------------------------------*/
  /* ask bufferSize1 bytes for external memory */
  stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                            CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                            &bufferSize1, NULL);PetscCallCUSPARSE(stat);
  PetscCallCUDA(cudaMalloc((void**) &dBuffer1, bufferSize1));
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                            CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                            &bufferSize1, dBuffer1);PetscCallCUSPARSE(stat);

  /*----------------------------------------------------------------------*/
  /* symbolic nnz computation: same query-size/allocate/call-again pattern as above */
  stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                 &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);PetscCallCUSPARSE(stat);
  PetscCallCUDA(cudaMalloc((void**) &dBuffer2, bufferSize2));
  PetscCallCUDA(cudaMalloc((void**) &dBuffer3, bufferSize3));
  PetscCallCUDA(cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4));
  stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                 &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);PetscCallCUSPARSE(stat);
  PetscCallCUDA(cudaFree(dBuffer1));
  PetscCallCUDA(cudaFree(dBuffer2));

  /*----------------------------------------------------------------------*/
  /* get matrix C non-zero entries C_nnz1 */
  PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
  c->nz = (PetscInt) C_nnz1;
  /* allocate matrix C */
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values         = new THRUSTARRAY(c->nz);PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  /* update matC with the new pointers */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());PetscCallCUSPARSE(stat);

  /*----------------------------------------------------------------------*/
  /* copy the symbolic structure into C; dBuffer5 is retained for later reuse_compute calls */
  stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                  &bufferSize5, NULL);PetscCallCUSPARSE(stat);
  PetscCallCUDA(cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5));
  stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                  &bufferSize5, mmdata->dBuffer5);PetscCallCUSPARSE(stat);
  PetscCallCUDA(cudaFree(dBuffer3));
  /* run one numeric compute here so C's values are valid right after the symbolic phase */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                                     Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                     cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                     mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
  PetscCall(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024));
 }
 #else
  size_t bufSize2;
  /* ask bufferSize bytes for external memory */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, NULL);PetscCallCUSPARSE(stat);
  PetscCallCUDA(cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2));
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);PetscCallCUSPARSE(stat);
  /* ask bufferSize again bytes for external memory */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);PetscCallCUSPARSE(stat);
  /* The CUSPARSE documentation is not clear, nor the API
     We need both buffers to perform the operations properly!
     mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
     it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
     is stored in the descriptor! What a messy API... */
  PetscCallCUDA(cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize));
  /* compute the intermediate product of A * B */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);PetscCallCUSPARSE(stat);
  /* get matrix C non-zero entries C_nnz1 */
  PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
  c->nz = (PetscInt) C_nnz1;
  PetscCall(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024));
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
 #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
#else
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
  /* legacy (< CUDA 11) path: csrgemmNnz gives the pattern, then a full csrgemm fills the values */
  stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);PetscCallCUSPARSE(stat);
  c->nz = cnz;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
     I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
     D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops));
  PetscCall(PetscLogGpuTimeEnd());
finalizesym:
  /* mirror the device CSR pattern into the host SeqAIJ structure so C is a fully valid SeqAIJ matrix */
  c->singlemalloc = PETSC_FALSE;
  c->free_a       = PETSC_TRUE;
  c->free_ij      = PETSC_TRUE;
  PetscCall(PetscMalloc1(m+1,&c->i));
  PetscCall(PetscMalloc1(c->nz,&c->j));
  if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
    PetscInt *d_i = c->i;
    THRUSTINTARRAY ii(Ccsr->row_offsets->size());
    THRUSTINTARRAY jj(Ccsr->column_indices->size());
    ii   = *Ccsr->row_offsets;
    jj   = *Ccsr->column_indices;
    if (ciscompressed) d_i = c->compressedrow.i;
    PetscCallCUDA(cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
  } else {
    PetscInt *d_i = c->i;
    if (ciscompressed) d_i = c->compressedrow.i;
    PetscCallCUDA(cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
  }
  if (ciscompressed) { /* need to expand host row offsets */
    /* rows not present in rindex are empty: give them the previous compressed offset */
    PetscInt r = 0;
    c->i[0] = 0;
    for (k = 0; k < c->compressedrow.nrows; k++) {
      const PetscInt next = c->compressedrow.rindex[k];
      const PetscInt old = c->compressedrow.i[k];
      for (; r < next; r++) c->i[r+1] = old;
    }
    for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
  }
  PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt)));
  PetscCall(PetscMalloc1(m,&c->ilen));
  PetscCall(PetscMalloc1(m,&c->imax));
  c->maxnz = c->nz;
  c->nonzerorowcnt = 0;
  c->rmax = 0;
  /* per-row lengths and row statistics, derived from the (expanded) row offsets */
  for (k = 0; k < m; k++) {
    const PetscInt nn = c->i[k+1] - c->i[k];
    c->ilen[k] = c->imax[k] = nn;
    c->nonzerorowcnt += (PetscInt)!!nn;
    c->rmax = PetscMax(c->rmax,nn);
  }
  PetscCall(MatMarkDiagonal_SeqAIJ(C));
  PetscCall(PetscMalloc1(c->nz,&c->a));
  Ccsr->num_entries = c->nz;

  C->nonzerostate++;
  PetscCall(PetscLayoutSetUp(C->rmap));
  PetscCall(PetscLayoutSetUp(C->cmap));
  Ccusp->nonzerostate = C->nonzerostate;
  C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
  C->preallocated  = PETSC_TRUE;
  C->assembled     = PETSC_FALSE;
  C->was_assembled = PETSC_FALSE;
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
2790 
2791 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2792 
/* handles sparse or dense B */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
  PetscErrorCode ierr;
  PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  /* classify the operands: dense B goes through the AIJ*Dense kernels, cusparse B/C through SpGEMM */
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense));
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    PetscCall(PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp));
  }
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      PetscCall(PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp));
    }
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool  usecpu = PETSC_FALSE;
    const char *mansec = NULL; /* user-facing function name / manual section for the option */
    const char *apiopt = NULL; /* option name when the product came from the api_user entry point */
    const char *ptitle = NULL; /* options-block title for the MatProduct interface */

    /* per-product-type option metadata; types without an entry (e.g. ABt) expose no CPU switch */
    switch (product->type) {
    case MATPRODUCT_AB:   mansec = "MatMatMult";          apiopt = "-matmatmult_backend_cpu";          ptitle = "MatProduct_AB";   break;
    case MATPRODUCT_AtB:  mansec = "MatTransposeMatMult"; apiopt = "-mattransposematmult_backend_cpu"; ptitle = "MatProduct_AtB";  break;
    case MATPRODUCT_PtAP: mansec = "MatPtAP";             apiopt = "-matptap_backend_cpu";             ptitle = "MatProduct_PtAP"; break;
    case MATPRODUCT_RARt: mansec = "MatRARt";             apiopt = "-matrart_backend_cpu";             ptitle = "MatProduct_RARt"; break;
    case MATPRODUCT_ABC:  mansec = "MatMatMatMult";       apiopt = "-matmatmatmult_backend_cpu";       ptitle = "MatProduct_ABC";  break;
    default:
      break;
    }
    if (mansec) {
      const char *title = product->api_user ? mansec : ptitle;
      const char *oname = product->api_user ? apiopt : "-mat_product_algorithm_backend_cpu";

      ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,title,"Mat");PetscCall(ierr);
      PetscCall(PetscOptionsBool(oname,"Use CPU code",mansec,usecpu,&usecpu,NULL));
      ierr = PetscOptionsEnd();PetscCall(ierr);
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE; /* force the CPU fallback below */
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (product->A->boundtocpu) {
        PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      /* products with more than two operands are composed from pairwise products */
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
  }
  PetscFunctionReturn(0);
}
2915 
/* yy = A*xx on the GPU: thin wrapper over the shared SpMV kernel with no add-vector
   (NULL), no transpose and no Hermitian conjugation. */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE));
  PetscFunctionReturn(0);
}
2922 
/* zz = A*xx + yy on the GPU (no transpose, no Hermitian conjugation). */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE));
  PetscFunctionReturn(0);
}
2929 
/* yy = A^H*xx (conjugate transpose) on the GPU: trans = herm = PETSC_TRUE. */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE));
  PetscFunctionReturn(0);
}
2936 
/* zz = A^H*xx + yy (conjugate transpose) on the GPU: trans = herm = PETSC_TRUE. */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE));
  PetscFunctionReturn(0);
}
2943 
/* yy = A^T*xx on the GPU: trans = PETSC_TRUE, herm = PETSC_FALSE. */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE));
  PetscFunctionReturn(0);
}
2950 
/* Device kernel: y[idx[i]] += x[i] for each i in [0,n).
   Used below to scatter-add the compressed SpMV result (short work vector x) back
   into the full-length output y at the row positions listed in idx.
   Launch: 1-D grid covering at least n threads; extra threads exit via the guard.
   NOTE(review): no atomics are used, so idx is assumed to hold distinct entries
   (it is the compressed-row index list, one entry per nonzero row) -- confirm. */
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  /* compute the flat index in PetscInt so the product cannot overflow 32-bit int
     when PetscInt is 64-bit and n exceeds INT_MAX */
  PetscInt i = (PetscInt)blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) y[idx[i]] += x[i];
}
2956 
/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
/*
   Shared implementation behind MatMult/MatMultAdd/MatMultTranspose/... for SEQAIJCUSPARSE.
   yy may be NULL (plain multiply) or equal to zz (in-place add). When the matrix stores
   compressed rows (zero rows dropped), a work vector holds the short product (or the
   gathered input in the transpose case) and the result is scattered back into zz.
 */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny;
#endif

  PetscFunctionBegin;
  PetscCheckFalse(herm && !trans,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
  if (!a->nz) {
    /* empty matrix: the product is zero, so z = y (or 0 when there is no y) */
    if (!yy) PetscCall(VecSet_SeqCUDA(zz,0));
    else PetscCall(VecCopy_SeqCUDA(yy,zz));
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    PetscCheck(matstruct,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !A->form_explicit_transpose) {
      /* let cuSPARSE apply the (conjugate) transpose operation to the non-transposed data */
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      /* use an explicitly stored transpose (built on demand) and multiply it non-transposed */
      if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    PetscCall(VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray));
    if (yy == zz) PetscCall(VecCUDAGetArray(zz,&zarray)); /* read & write zz, so need to get uptodate zarray on GPU */
    else PetscCall(VecCUDAGetArrayWrite(zz,&zarray)); /* write zz, so no need to init zarray on GPU */

    PetscCall(PetscLogGpuTimeBegin());
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
     #endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
     #endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* opA indexes the cached per-operation dense-vector descriptors below, hence the range check */
      PetscCheck(opA >= 0 && opA <= 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype));
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype));
        PetscCallCUSPARSE(cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                               matstruct->matDescr,
                                               matstruct->cuSpMV[opA].vecXDescr, beta,
                                               matstruct->cuSpMV[opA].vecYDescr,
                                               cusparse_scalartype,
                                               cusparsestruct->spmvAlg,
                                               &matstruct->cuSpMV[opA].spmvBufferSize));
        PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize));

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr));
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr));
      }

      PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA,
                                  matstruct->alpha_one,
                                  matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
                                  matstruct->cuSpMV[opA].vecXDescr,
                                  beta,
                                  matstruct->cuSpMV[opA].vecYDescr,
                                  cusparse_scalartype,
                                  cusparsestruct->spmvAlg,
                                  matstruct->cuSpMV[opA].spmvBuffer));
     #else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA,
                                       mat->num_rows, mat->num_cols,
                                       mat->num_entries, matstruct->alpha_one, matstruct->descr,
                                       mat->values->data().get(), mat->row_offsets->data().get(),
                                       mat->column_indices->data().get(), xptr, beta,
                                       dptr));
     #endif
    } else {
      if (cusparsestruct->nrows) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                         matstruct->alpha_one, matstruct->descr, hybMat,
                                         xptr, beta,
                                         dptr));
       #endif
      }
    }
    PetscCall(PetscLogGpuTimeEnd());

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          PetscCall(VecCopy_SeqCUDA(yy,zz)); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          PetscCall(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        PetscCall(VecSet_SeqCUDA(zz,0));
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        PetscCall(PetscLogGpuTimeBegin());
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
         */
       #if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
       #else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
       #endif
        PetscCall(PetscLogGpuTimeEnd());
      }
    } else {
      if (yy && yy != zz) {
        PetscCall(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */
      }
    }
    PetscCall(VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray));
    if (yy == zz) PetscCall(VecCUDARestoreArray(zz,&zarray));
    else PetscCall(VecCUDARestoreArrayWrite(zz,&zarray));
  } catch(char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (yy) {
    PetscCall(PetscLogGpuFlops(2.0*a->nz));
  } else {
    PetscCall(PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt));
  }
  PetscFunctionReturn(0);
}
3145 
/* zz = A^T*xx + yy on the GPU: trans = PETSC_TRUE, herm = PETSC_FALSE. */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE));
  PetscFunctionReturn(0);
}
3152 
/* Completes assembly through the CPU SeqAIJ path; if the nonzero state changed during
   assembly, the cached packed device matrix (deviceMat) is stale and is freed here. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscObjectState   statebefore     = A->nonzerostate; /* snapshot before assembly runs */

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqAIJ(A,mode));
  if (cusparsestruct->deviceMat && statebefore != A->nonzerostate) {
    PetscCall(PetscInfo(A,"Destroy device mat since nonzerostate changed\n"));
    PetscCallCUDA(cudaFree(cusparsestruct->deviceMat));
    cusparsestruct->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}
3168 
3169 /* --------------------------------------------------------------------------------*/
3170 /*@
3171    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
3173    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
3174    assembly performance the user should preallocate the matrix storage by setting
3175    the parameter nz (or the array nnz).  By setting these parameters accurately,
3176    performance during matrix assembly can be increased by more than a factor of 50.
3177 
3178    Collective
3179 
3180    Input Parameters:
3181 +  comm - MPI communicator, set to PETSC_COMM_SELF
3182 .  m - number of rows
3183 .  n - number of columns
3184 .  nz - number of nonzeros per row (same for all rows)
3185 -  nnz - array containing the number of nonzeros in the various rows
3186          (possibly different for each row) or NULL
3187 
3188    Output Parameter:
3189 .  A - the matrix
3190 
3191    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
3193    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
3194 
3195    Notes:
3196    If nnz is given then nz is ignored
3197 
3198    The AIJ format (also called the Yale sparse matrix format or
3199    compressed row storage), is fully compatible with standard Fortran 77
3200    storage.  That is, the stored row and column indices can begin at
3201    either one (as in Fortran) or zero.  See the users' manual for details.
3202 
3203    Specify the preallocated storage with either nz or nnz (not both).
3204    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
3205    allocation.  For large problems you MUST preallocate memory or you
3206    will get TERRIBLE performance, see the users' manual chapter on matrices.
3207 
3208    By default, this format uses inodes (identical nodes) when possible, to
3209    improve numerical efficiency of matrix-vector products and solves. We
3210    search for consecutive rows with the same nonzero structure, thereby
3211    reusing matrix information to achieve increased efficiency.
3212 
3213    Level: intermediate
3214 
3215 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
3216 @*/
/* See the manual page above for the full user-facing documentation. */
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscFunctionBegin;
  PetscCall(MatCreate(comm,A));
  PetscCall(MatSetSizes(*A,m,n,m,n)); /* sequential matrix: local sizes equal global sizes */
  PetscCall(MatSetType(*A,MATSEQAIJCUSPARSE));
  /* call the SeqAIJ preallocation implementation directly rather than the dispatching front end */
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz));
  PetscFunctionReturn(0);
}
3226 
/* Destroys the GPU-side context (regular or triangular-factor variant), removes all
   composed methods registered by this type, then destroys the underlying SeqAIJ data. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  /* composed-function names installed on this object; each is cleared below */
  const char *const composed[] = {
    "MatSeqAIJCopySubArray_C",
    "MatCUSPARSESetFormat_C",
    "MatCUSPARSESetUseCPUSolve_C",
    "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",
    "MatProductSetFromOptions_seqaijcusparse_seqdense_C",
    "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",
    "MatFactorGetSolverType_C",
    "MatSetPreallocationCOO_C",
    "MatSetValuesCOO_C",
    "MatConvert_seqaijcusparse_hypre_C"
  };

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr));
  } else {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr));
  }
  for (size_t i = 0; i < sizeof(composed)/sizeof(composed[0]); i++) {
    PetscCall(PetscObjectComposeFunction((PetscObject)A,composed[i],NULL));
  }
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(0);
}
3248 
3249 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
3250 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicates A by first duplicating it as a plain SeqAIJ matrix and then converting the
   copy in place back to SEQAIJCUSPARSE (which re-installs the GPU function table). */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqAIJ(A,cpvalues,B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B));
  PetscFunctionReturn(0);
}
3258 
/* Y = a*X + Y. GPU fast paths (when both operands use this implementation):
   - SAME_NONZERO_PATTERN:   one cublas axpy over the two value arrays;
   - SUBSET_NONZERO_PATTERN: cusparse csrgeam (sparse matrix addition);
   - otherwise:              CPU SeqAIJ fallback (also used when either matrix
                             does not share this axpy implementation). */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry,*csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    /* the two matrices do not share this implementation (e.g. one is bound to the CPU):
       invalidate Y's cached transpose and let the CPU code do the work */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y,a,X,str));
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(cy->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  PetscCheck(cx->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;
  /* see if we can turn this into a cublas axpy */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    /* compare row offsets and column indices on the device; equal structure upgrades str */
    bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
    if (eq) {
      eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
    }
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    PetscScalar b = 1.0; /* coefficient of Y in a*X + b*Y */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t      bufferSize;
    void        *buffer;
#endif

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay));
    /* &a and &b below are host pointers, so switch the pointer mode for the duration of the call */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
                                                  &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                                  &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize));
    PetscCallCUDA(cudaMalloc(&buffer,bufferSize));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                                       &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                       &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                       cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCallCUDA(cudaFree(buffer));
#else
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                                       &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                       &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                       cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get()));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else if (str == SAME_NONZERO_PATTERN) {
    cublasHandle_t cublasv2handle;
    PetscBLASInt   one = 1, bnz = 1;

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay));
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    PetscCall(PetscBLASIntCast(x->nz,&bnz));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one));
    PetscCall(PetscLogGpuFlops(2.0*bnz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else {
    /* no fast GPU path for fully different patterns: fall back to the CPU implementation */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y,a,X,str));
  }
  PetscFunctionReturn(0);
}
3352 
/* Y = a*Y: scales every stored nonzero in place on the GPU with one cublas scal call,
   then invalidates the cached diagonal. */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)Y->data;
  cublasHandle_t handle;
  PetscScalar    *vals;
  PetscBLASInt   inc = 1, n = 1;

  PetscFunctionBegin;
  PetscCall(PetscCUBLASGetHandle(&handle));
  PetscCall(PetscBLASIntCast(aij->nz,&n)); /* number of stored nonzeros, as a BLAS int */
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&vals));
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXscal(handle,n,&a,vals,inc));
  PetscCall(PetscLogGpuFlops(n));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&vals));
  PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  PetscFunctionReturn(0);
}
3372 
/* Sets all stored nonzero values of A to zero. Existing GPU copies (including the
   cached explicit transpose) are zeroed directly with thrust::fill; the host value
   array is always zeroed. When the primary GPU copy was zeroed too, both copies
   agree and the offload mask becomes PETSC_OFFLOAD_BOTH, otherwise PETSC_OFFLOAD_CPU. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  Mat_SeqAIJ *aij       = (Mat_SeqAIJ*)A->data;
  PetscBool  gpuzeroed  = PETSC_FALSE;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
    if (cusp->mat) {
      CsrMatrix *csr = (CsrMatrix*)cusp->mat->mat;
      if (csr->values) {
        gpuzeroed = PETSC_TRUE;
        thrust::fill(thrust::device,csr->values->begin(),csr->values->end(),0.);
      }
    }
    if (cusp->matTranspose) {
      CsrMatrix *csrT = (CsrMatrix*)cusp->matTranspose->mat;
      if (csrT->values) thrust::fill(thrust::device,csrT->values->begin(),csrT->values->end(),0.);
    }
  }
  PetscCall(PetscArrayzero(aij->a,aij->i[A->rmap->n]));
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  A->offloadmask = gpuzeroed ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
3401 
/* Switches A's operation tables between the CPU (SeqAIJ) and GPU (CUSPARSE) implementations.
   flg == PETSC_TRUE: copy data back from the GPU, install the CPU function pointers and
   remove the GPU-only composed methods. flg == PETSC_FALSE: install the CUSPARSE function
   pointers, array accessors and composed methods. Factored matrices only record the flag. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    A->boundtocpu = flg;
    PetscFunctionReturn(0);
  }
  if (flg) {
    /* bring the host copy up to date before handing control to the CPU kernels */
    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* clear the SeqAIJ-level (array access) function table that the GPU branch fills below */
    PetscCall(PetscMemzero(a->ops,sizeof(Mat_SeqAIJOps)));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL));
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
  }
  A->boundtocpu = flg;
  /* inode optimizations are only used by the CPU kernels */
  if (flg && a->inode.size) {
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3464 
/* Converts a SEQAIJ matrix to SEQAIJCUSPARSE (initial copy, reuse, or in place).
   Allocates the GPU-side context (regular or triangular-factor variant) with a
   cuSPARSE handle on PETSc's default stream and CSR/algorithm defaults, switches
   the default vector type to CUDA, installs the type's methods and unbinds the
   matrix from the CPU. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  Mat              B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    PetscCall(MatDuplicate(A,MAT_COPY_VALUES,newmat));
  } else if (reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatCopy(A,*newmat,SAME_NONZERO_PATTERN));
  }
  B = *newmat; /* for MAT_INPLACE_MATRIX, neither branch above ran and B is A itself */

  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA,&B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      /* regular matrix: full Mat_SeqAIJCUSPARSE context with format/algorithm defaults */
      Mat_SeqAIJCUSPARSE *spptr;
      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle,PetscDefaultCudaStream));
      spptr->format     = MAT_CUSPARSE_CSR;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
     #if PETSC_PKG_CUDA_VERSION_GE(11,2,0)
      spptr->spmvAlg    = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
     #else
      spptr->spmvAlg    = CUSPARSE_CSRMV_ALG1;    /* default, since we only support csr */
     #endif
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
     #endif
      B->spptr = spptr;
    } else {
      /* factored matrix: triangular-factor context instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle,PetscDefaultCudaStream));
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE)); /* installs the GPU method table and composed methods */
  PetscCall(PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetUseCPUSolve_C",MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(0);
}
3524 
/* Registered constructor for MATSEQAIJCUSPARSE: create a plain MATSEQAIJ matrix
   and convert it to the CUSPARSE type in place */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqAIJ(B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B));
  PetscFunctionReturn(0);
}
3532 
3533 /*MC
3534    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3535 
   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
3538    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3539 
3540    Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_use_cpu_solve - Do MatSolve on CPU
3545 
3546   Level: beginner
3547 
3548 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3549 M*/
3550 
3551 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);
3552 
/* Register the cuSPARSE-backed solver packages with PETSc's solver registry:
   MATSOLVERCUSPARSEBAND provides a banded LU for MATSEQAIJ, and MATSOLVERCUSPARSE
   provides LU/Cholesky/ILU/ICC factorizations for MATSEQAIJCUSPARSE matrices */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscFunctionBegin;
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND,MATSEQAIJ,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse));

  PetscFunctionReturn(0);
}
3564 
/* Release the device-side COO assembly data (both the 'basic' cooPerm arrays and the
   extended-COO jmap/perm arrays) so the matrix can be re-preallocated.

   Fix: the freed device pointers jmap_d/perm_d are now reset to NULL; previously they
   were left dangling, and MatSeqAIJCUSPARSE_Destroy() — which frees them whenever they
   are non-NULL — would free them a second time. */
static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat)
{
  Mat_SeqAIJCUSPARSE* cusp = (Mat_SeqAIJCUSPARSE*)mat->spptr;

  PetscFunctionBegin;
  if (!cusp) PetscFunctionReturn(0);
  delete cusp->cooPerm;
  delete cusp->cooPerm_a;
  cusp->cooPerm   = NULL;
  cusp->cooPerm_a = NULL;
  if (cusp->use_extended_coo) {
    PetscCallCUDA(cudaFree(cusp->jmap_d));
    PetscCallCUDA(cudaFree(cusp->perm_d));
    cusp->jmap_d = NULL; /* avoid a double free in MatSeqAIJCUSPARSE_Destroy() */
    cusp->perm_d = NULL;
  }
  cusp->use_extended_coo = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3582 
/* Free a Mat_SeqAIJCUSPARSE context and everything it owns: the device CSR copies of
   the matrix and its transpose, work/offset/permutation vectors, COO assembly data,
   and the cusparse handle; finally free the struct itself */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscFunctionBegin;
  if (*cusparsestruct) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format));
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    delete (*cusparsestruct)->csr2csc_i;
    if ((*cusparsestruct)->handle) PetscCallCUSPARSE(cusparseDestroy((*cusparsestruct)->handle));
    /* extended-COO device arrays, allocated in MatSetPreallocationCOO_SeqAIJCUSPARSE() */
    if ((*cusparsestruct)->jmap_d) PetscCallCUDA(cudaFree((*cusparsestruct)->jmap_d));
    if ((*cusparsestruct)->perm_d) PetscCallCUDA(cudaFree((*cusparsestruct)->perm_d));
    PetscCall(PetscFree(*cusparsestruct));
  }
  PetscFunctionReturn(0);
}
3601 
/* Free a CsrMatrix container: its three device arrays (values, column indices, row
   offsets) and the container itself, then NULL the caller's pointer */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *m = *mat;

  PetscFunctionBegin;
  if (m) {
    delete m->values;
    delete m->column_indices;
    delete m->row_offsets;
    delete m;
    *mat = NULL;
  }
  PetscFunctionReturn(0);
}
3614 
/* Destroy one triangular-factor structure: its matrix descriptor, analysis info,
   CSR data, solve buffer and host staging array. (Despite the shared name, this
   overload of MatSeqAIJCUSPARSEMultStruct_Destroy() takes a
   Mat_SeqAIJCUSPARSETriFactorStruct, not a MultStruct.) */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
    if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparse_destroy_analysis_info((*trifactor)->solveInfo));
    PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
    if ((*trifactor)->solveBuffer)   PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
    if ((*trifactor)->AA_h)   PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h));
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
   #endif
    PetscCall(PetscFree(*trifactor));
  }
  PetscFunctionReturn(0);
}
3631 
/* Destroy a Mat_SeqAIJCUSPARSEMultStruct (device data used by MatMult and friends)
   stored in the given format, and set *matstruct to NULL.

   Frees the CSR (or legacy HYB/ELL) matrix data, the cusparse matrix descriptor, the
   compressed-row index array, the device-resident scalar constants, and — for CUDA 11
   and later — the generic SpMat descriptor and any cached SpMV vector descriptors and
   work buffers.

   Fix: the CsrMatrix_Destroy() return code is now checked with PetscCall(); it was
   silently dropped, inconsistent with every other destroy call in this file. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
       #endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        PetscCall(CsrMatrix_Destroy(&mat)); /* was called without error checking */
      }
    }
    if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
    if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
    if ((*matstruct)->beta_one)  PetscCallCUDA(cudaFree((*matstruct)->beta_one));

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
    for (int i=0; i<3; i++) { /* free each cached SpMV setup, if it was ever initialized */
      if (mdata->cuSpMV[i].initialized) {
        PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
3673 
/* Free all factor data held by a Mat_SeqAIJCUSPARSETriFactors container while keeping
   the container (and its cusparse handle) so it can be refilled by a new numeric
   factorization. Intended to be safe to call repeatedly.

   Fix: the freed band-factor device pointers a_band_d/i_band_d are now reset to NULL;
   previously a second Reset() (or any later pass over these guarded frees) would call
   cudaFree() on the already-freed pointers. */
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors)
{
  PetscFunctionBegin;
  if (*trifactors) {
    /* lower/upper factors and their transposes */
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose));
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = NULL;
    (*trifactors)->cpermIndices = NULL;
    (*trifactors)->workVector = NULL;
    if ((*trifactors)->a_band_d)   PetscCallCUDA(cudaFree((*trifactors)->a_band_d));
    if ((*trifactors)->i_band_d)   PetscCallCUDA(cudaFree((*trifactors)->i_band_d));
    (*trifactors)->a_band_d = NULL; /* avoid double free on a repeated Reset */
    (*trifactors)->i_band_d = NULL;
    (*trifactors)->init_dev_prop = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3694 
/* Fully destroy a Mat_SeqAIJCUSPARSETriFactors container: reset its factor data,
   destroy its cusparse handle, and free the container itself */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  cusparseHandle_t handle;

  PetscFunctionBegin;
  if (*trifactors) {
    handle = (*trifactors)->handle; /* Reset() leaves the handle untouched */
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
    if (handle) PetscCallCUSPARSE(cusparseDestroy(handle));
    PetscCall(PetscFree(*trifactors));
  }
  PetscFunctionReturn(0);
}
3709 
/* Strict-weak ordering on (i,j) pairs: lexicographic, row first then column */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &a, const thrust::tuple<PetscInt, PetscInt> &b)
  {
    if (a.get<0>() != b.get<0>()) return a.get<0>() < b.get<0>();
    return a.get<1>() < b.get<1>();
  }
};
3720 
/* Equality on (i,j) pairs: both components must match */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &a, const thrust::tuple<PetscInt, PetscInt> &b)
  {
    return a.get<0>() == b.get<0>() && a.get<1>() == b.get<1>();
  }
};
3730 
/* Binary op for adjacent_difference: 1 where consecutive values differ, else 0 */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &a, const PetscInt &b)
  {
    return (a != b) ? 1 : 0;
  }
};
3739 
/* Binary op combining two 0/1 flags: logical OR, returned as 0 or 1 */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &a, const PetscInt &b)
  {
    return (a || b) ? 1 : 0;
  }
};
3748 
3749 #include <thrust/iterator/discard_iterator.h>
3750 /* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic() */
/* Set matrix values from a COO list previously analyzed by
   MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic().

   v[] holds the COO values in the user's original (unsorted, possibly repeated)
   order, on host or device; cusp->cooPerm maps sorted order back into v[], and
   cusp->cooPerm_a (when non-NULL) maps each sorted entry to its unique-nonzero id so
   repeated entries can be reduced. INSERT_VALUES replaces the CSR values (v == NULL
   zeros them); ADD_VALUES accumulates into them (v == NULL is then a no-op). */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY                           *cooPerm_v = NULL; /* device copy of v[] when v is host memory */
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                             *matrix;
  PetscInt                              n;

  PetscFunctionBegin;
  PetscCheck(cusp,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    /* no COO preallocation information: fall back to a regular final assembly */
    PetscCall(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY));
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) {
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else {
    /* stage the host values on the device */
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v,v+n);
    d_v = cooPerm_v->data();
    PetscCall(PetscLogCpuToGpu(n*sizeof(PetscScalar)));
  }
  PetscCall(PetscLogGpuTimeBegin());
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to sum them first */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
        cooPerm_a = [0,0,1,2,3,4]. The length is n, the number of nonzeros in d_v[].
        cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
      */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* all nonzeros in d_v[] are unique entries */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]]  */
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else {
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals()); /* values[i] = d_v[cooPerm[i]] */
    }
  }
  PetscCall(PetscLogGpuTimeEnd());
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",A->rmap->n,A->cmap->n,a->nz));
  PetscCall(PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",a->rmax));
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
3831 
/* Mark the cached device transpose of A as out of date; if destroy is true, also
   release the transpose structure and the cached csr2csc index map */
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp) {
    if (destroy) {
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format));
      delete cusp->csr2csc_i;
      cusp->csr2csc_i = NULL;
    }
    A->transupdated = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3847 
3848 #include <thrust/binary_search.h>
3849 /* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */
/* Preallocate from a COO triplet list without negative indices ('basic' path).

   Sorts (coo_i,coo_j) on the device, removes duplicates, builds the host CSR arrays
   a->i/a->j, and records in cusp->cooPerm the permutation from sorted order back to
   the user's ordering (plus, when duplicates exist, cusp->cooPerm_a mapping sorted
   entries to unique-nonzero ids) for use by MatSetValuesCOO_SeqAIJCUSPARSE_Basic(). */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0; /* nzr counts the nonempty rows */

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(A->rmap));
  PetscCall(PetscLayoutSetUp(A->cmap));
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    /* previous COO data has a different size; discard it */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);          /* device copies of the COO indices */
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);  /* per-row end offsets of the CSR structure */

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    PetscCall(PetscLogCpuToGpu(2.*n*sizeof(PetscInt)));
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);

    /* Ex.
      n = 6
      coo_i = [3,3,1,4,1,4]
      coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    PetscCall(PetscLogGpuTimeBegin());
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    *cusp->cooPerm_a = d_i; /* copy the sorted array */
    THRUSTINTARRAY w = d_j;

    /*
      d_i     = [1,1,3,3,4,4]
      d_j     = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i     = [1,3,3,4,4,x]
                            ^ekey
      d_j     = [2,2,3,5,6,x]
                           ^nekey
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());                                              /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
      (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a =          [0,0,1,1,1,1]*/
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin()); /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    PetscCall(PetscLogGpuTimeEnd());

    /* rebuild the host CSR arrays from the device results */
    PetscCall(MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i));
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(A->rmap->n+1,&a->i));
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    PetscCallCUDA(cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    PetscCall(PetscMalloc1(a->nz,&a->a));
    PetscCall(PetscMalloc1(a->nz,&a->j));
    PetscCallCUDA(cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n,&a->ilen));
    if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n,&a->imax));
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    PetscCall(PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt)));
    PetscCall(MatMarkDiagonal_SeqAIJ(A));
  } else {
    PetscCall(MatSeqAIJSetPreallocation(A,0,NULL));
  }
  PetscCall(MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE));

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  PetscCall(PetscArrayzero(a->a,a->nz));
  PetscCall(MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6));
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
  PetscFunctionReturn(0);
}
3961 
/* Preallocate a MATSEQAIJCUSPARSE matrix from a COO triplet list.

   If the host-side indices contain no negative entries (or the indices already live
   on the device), the 'basic' thrust-based path is used. Otherwise the generic
   SeqAIJ COO analysis runs on the host and its jmap/perm arrays are mirrored to the
   device for MatSetValuesCOO_SeqAIJCUSPARSE(). */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_SeqAIJ         *seq;
  Mat_SeqAIJCUSPARSE *dev;
  PetscBool          coo_basic = PETSC_TRUE;
  PetscMemType       mtype = PETSC_MEMTYPE_DEVICE;

  PetscFunctionBegin;
  /* discard any previous COO preallocation data (host and device) */
  PetscCall(MatResetPreallocationCOO_SeqAIJ(mat));
  PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat));
  if (coo_i) {
    PetscCall(PetscGetMemType(coo_i,&mtype));
    if (PetscMemTypeHost(mtype)) {
      /* negative indices (ignored entries) force the extended path */
      for (PetscCount k=0; k<coo_n; k++) {
        if (coo_i[k] < 0 || coo_j[k] < 0) {coo_basic = PETSC_FALSE; break;}
      }
    }
  }

  if (coo_basic) { /* i,j are on device or do not contain negative indices */
    PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j));
  } else {
    PetscCall(MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));
    seq  = static_cast<Mat_SeqAIJ*>(mat->data);
    dev  = static_cast<Mat_SeqAIJCUSPARSE*>(mat->spptr);
    /* mirror the host-computed jmap/perm maps on the device */
    PetscCallCUDA(cudaMalloc((void**)&dev->jmap_d,(seq->nz+1)*sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->jmap_d,seq->jmap,(seq->nz+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMalloc((void**)&dev->perm_d,seq->Atot*sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->perm_d,seq->perm,seq->Atot*sizeof(PetscCount),cudaMemcpyHostToDevice));
    dev->use_extended_coo = PETSC_TRUE;
  }
  PetscFunctionReturn(0);
}
3997 
/* Gather COO contributions into CSR values: a[i] receives the sum of kv[perm[k]]
   for k in [jmap[i], jmap[i+1]), replacing (INSERT_VALUES) or accumulating into
   (ADD_VALUES) the existing value. Launched with a 1D grid; the grid-stride loop
   makes any launch configuration valid. */
__global__ void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount jmap[],const PetscCount perm[],InsertMode imode,PetscScalar a[])
{
  const PetscCount stride = (PetscCount)gridDim.x * blockDim.x;
  for (PetscCount tid = blockIdx.x*blockDim.x + threadIdx.x; tid < nnz; tid += stride) {
    PetscScalar partial = 0.0;
    for (PetscCount p = jmap[tid]; p < jmap[tid+1]; p++) partial += kv[perm[p]];
    a[tid] = partial + (imode == INSERT_VALUES ? 0.0 : a[tid]);
  }
}
4008 
/* Set values from a COO list previously analyzed by MatSetPreallocationCOO_SeqAIJCUSPARSE().

   In the extended-COO case the values are gathered on the device by the
   MatAddCOOValues kernel through the jmap/perm maps (copying v[] to the device first
   if it is host memory); otherwise the 'basic' thrust path handles it. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ          *seq = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE  *dev = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscCount          Annz = seq->nz; /* number of nonzeros in the assembled CSR matrix */
  PetscMemType        memtype;
  const PetscScalar   *v1 = v;
  PetscScalar         *Aa;

  PetscFunctionBegin;
  if (dev->use_extended_coo) {
    PetscCall(PetscGetMemType(v,&memtype));
    if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
      PetscCallCUDA(cudaMalloc((void**)&v1,seq->coo_n*sizeof(PetscScalar)));
      PetscCallCUDA(cudaMemcpy((void*)v1,v,seq->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice));
    }

    /* INSERT_VALUES overwrites every entry, so write-only access suffices */
    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa));
    else PetscCall(MatSeqAIJCUSPARSEGetArray(A,&Aa));

    if (Annz) {
      /* one thread per nonzero; the kernel's grid-stride loop covers any remainder */
      MatAddCOOValues<<<(Annz+255)/256,256>>>(v1,Annz,dev->jmap_d,dev->perm_d,imode,Aa);
      PetscCallCUDA(cudaPeekAtLastError());
    }

    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa));
    else PetscCall(MatSeqAIJCUSPARSERestoreArray(A,&Aa));

    if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void*)v1));
  } else {
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A,v,imode));
  }
  PetscFunctionReturn(0);
}
4043 
4044 /*@C
4045     MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices.
4046 
4047    Not collective
4048 
4049     Input Parameters:
4050 +   A - the matrix
4051 -   compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form
4052 
    Output Parameters:
+   i - the CSR row pointers
-   j - the CSR column indices
4056 
4057     Level: developer
4058 
4059     Notes:
4060       When compressed is true, the CSR structure does not contain empty rows
4061 
4062 .seealso: MatSeqAIJCUSPARSERestoreIJ(), MatSeqAIJCUSPARSEGetArrayRead()
4063 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  /* NOTE(review): returns early unless BOTH pointers are requested, which makes the
     later `if (i)`/`if (j)` guards always true -- confirm this is intended */
  if (!i || !j) PetscFunctionReturn(0);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckFalse(cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* make sure the device CSR data is current */
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      /* build (once) and cache the uncompressed row offsets on the device */
      if (!cusp->rowoffsets_gpu) {
        cusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(0);
}
4091 
4092 /*@C
4093     MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ()
4094 
4095    Not collective
4096 
4097     Input Parameters:
4098 +   A - the matrix
4099 -   compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form
4100 
    Output Parameters:
+   i - the CSR row pointers
-   j - the CSR column indices
4104 
4105     Level: developer
4106 
4107 .seealso: MatSeqAIJCUSPARSEGetIJ()
4108 @*/
/* Counterpart of MatSeqAIJCUSPARSEGetIJ(): no device state to release, the caller's
   pointers are simply invalidated */
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (j) *j = NULL;
  if (i) *i = NULL;
  PetscFunctionReturn(0);
}
4118 
4119 /*@C
4120    MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4121 
4122    Not Collective
4123 
4124    Input Parameter:
4125 .   A - a MATSEQAIJCUSPARSE matrix
4126 
4127    Output Parameter:
4128 .   a - pointer to the device data
4129 
4130    Level: developer
4131 
4132    Notes: may trigger host-device copies if up-to-date matrix data is on host
4133 
4134 .seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArrayRead()
4135 @*/
/* Hand out a read-only device pointer to the CSR values, updating the device copy
   from the host first if necessary */
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckFalse(cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusparsestruct->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix*)cusparsestruct->mat->mat;
  PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = matrix->values->data().get();
  PetscFunctionReturn(0);
}
4153 
4154 /*@C
4155    MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead()
4156 
4157    Not Collective
4158 
4159    Input Parameter:
4160 .   A - a MATSEQAIJCUSPARSE matrix
4161 
4162    Output Parameter:
4163 .   a - pointer to the device data
4164 
4165    Level: developer
4166 
4167 .seealso: MatSeqAIJCUSPARSEGetArrayRead()
4168 @*/
/* Counterpart of MatSeqAIJCUSPARSEGetArrayRead(): read-only access needs no flush
   or state bump, so only the caller's pointer is invalidated */
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  *a = NULL;
  PetscFunctionReturn(0);
}
4178 
4179 /*@C
4180    MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4181 
4182    Not Collective
4183 
4184    Input Parameter:
4185 .   A - a MATSEQAIJCUSPARSE matrix
4186 
4187    Output Parameter:
4188 .   a - pointer to the device data
4189 
4190    Level: developer
4191 
4192    Notes: may trigger host-device copies if up-to-date matrix data is on host
4193 
4194 .seealso: MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArray()
4195 @*/
/* Hand out a read-write device pointer to the CSR values; the device copy becomes the
   authoritative one and any cached transpose is marked stale */
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckFalse(cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusparsestruct->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix*)cusparsestruct->mat->mat;
  PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = matrix->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;              /* caller may modify the device values */
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
  PetscFunctionReturn(0);
}
4215 /*@C
4216    MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray()
4217 
4218    Not Collective
4219 
   Input Parameters:
+   A - a MATSEQAIJCUSPARSE matrix
-   a - pointer to the device data obtained with MatSeqAIJCUSPARSEGetArray()
4225 
4226    Level: developer
4227 
4228 .seealso: MatSeqAIJCUSPARSEGetArray()
4229 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* the caller had read-write access, so cached derived data (e.g. the
     factored diagonal) may be stale and the object state must advance */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(0);
}
4241 
4242 /*@C
4243    MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4244 
4245    Not Collective
4246 
4247    Input Parameter:
4248 .   A - a MATSEQAIJCUSPARSE matrix
4249 
4250    Output Parameter:
4251 .   a - pointer to the device data
4252 
4253    Level: developer
4254 
4255    Notes: does not trigger host-device copies and flags data validity on the GPU
4256 
4257 .seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSERestoreArrayWrite()
4258 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csrmat;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* only the CSR storage format exposes its values as one flat device array */
  PetscCheckFalse(cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* write-only access: unlike MatSeqAIJCUSPARSEGetArray(), do NOT copy host
     values to the device — the caller promises to overwrite them all */
  PetscCheck(cusparsestruct->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)cusparsestruct->mat->mat;
  PetscCheck(csrmat->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csrmat->values->data().get();
  /* the GPU copy becomes authoritative; any cached transpose values are stale */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
  PetscFunctionReturn(0);
}
4277 
4278 /*@C
4279    MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite()
4280 
4281    Not Collective
4282 
   Input Parameters:
+   A - a MATSEQAIJCUSPARSE matrix
-   a - pointer to the device data obtained with MatSeqAIJCUSPARSEGetArrayWrite()
4288 
4289    Level: developer
4290 
4291 .seealso: MatSeqAIJCUSPARSEGetArrayWrite()
4292 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* values were (re)written on the device: invalidate cached derived data
     and advance the object state so dependents recompute */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(0);
}
4304 
/* Strict-weak ordering on (row, col, value, perm-flag) tuples: sorts by row,
   breaking ties by column; the value and flag fields do not affect the order. */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    const int r1 = thrust::get<0>(t1), r2 = thrust::get<0>(t2);
    if (r1 != r2) return r1 < r2;               /* distinct rows: row order decides */
    return thrust::get<1>(t1) < thrust::get<1>(t2); /* same row: column order decides */
  }
};
4315 
/* Unary functor adding a fixed offset to an integer; used to relocate CSR
   column indices / row offsets when concatenating two matrices. */
struct Shift
{
  int offset; /* amount added to every input value */

  Shift(int shift) : offset(shift) {}
  __host__ __device__
  inline int operator() (const int &c) const
  {
    return offset + c;
  }
};
4327 
/* Merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows: C = [A';B']'
   in matlab notation, i.e. row i of C is [row i of A, row i of B].

   Input:  A, B  - MATSEQAIJCUSPARSE matrices with the same number of rows
           reuse - MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX (MAT_INPLACE_MATRIX unsupported)
   Output: C     - merged matrix, entirely assembled on the GPU

   With MAT_REUSE_MATRIX only the numerical values are refreshed, using the
   permutation (cooPerm) recorded when the pattern was first built. */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     Annz,Bnnz;
  cusparseStatus_t             stat;
  PetscInt                     i,m,n,zero = 0;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidHeaderSpecific(B,MAT_CLASSID,2);
  PetscValidPointer(C,4);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
  PetscCheck(A->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
  PetscCheckFalse(reuse == MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
  PetscCheckFalse(Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCheckFalse(Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    m     = A->rmap->n;
    n     = A->cmap->n + B->cmap->n;
    PetscCall(MatCreate(PETSC_COMM_SELF,C));
    PetscCall(MatSetSizes(*C,m,n,m,n));
    PetscCall(MatSetType(*C,MATSEQAIJCUSPARSE));
    c     = (Mat_SeqAIJ*)(*C)->data;
    Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
    Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr  = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows    = m;
    Ccusp->mat      = Cmat;
    Ccusp->mat->mat = Ccsr;
    Ccsr->num_rows  = m;
    Ccsr->num_cols  = n;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
    PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
    PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
    PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
    PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr = (CsrMatrix*)Acusp->mat->mat;
    Bcsr = (CsrMatrix*)Bcusp->mat->mat;
    Annz = (PetscInt)Acsr->column_indices->size();
    Bnnz = (PetscInt)Bcsr->column_indices->size();
    c->nz = Annz + Bnnz;
    Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    Ccsr->num_entries = c->nz;
    /* cooPerm records, for each entry of C, where it came from; reused to refresh values */
    Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      auto Acoo = new THRUSTINTARRAY32(Annz);
      auto Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff,*Broff;

      /* csr2coo below needs row offsets for all m rows, so expand a
         compressed-row representation to a full one (cached on the object) */
      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt)));
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      PetscCall(PetscLogGpuTimeBegin());
      /* convert both operands to COO so their entries can be merged by (row,col) */
      stat = cusparseXcsr2coo(Acusp->handle,
                              Aroff->data().get(),
                              Annz,
                              m,
                              Acoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle,
                              Broff->data().get(),
                              Bnnz,
                              m,
                              Bcoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
      /* shift B's columns by A->cmap->n on the fly so they land to the right of A */
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
#endif
      auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
      auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
      auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
      auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
      auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
      auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
      auto p1 = Ccusp->cooPerm->begin();
      auto p2 = Ccusp->cooPerm->begin();
      thrust::advance(p2,Annz);
      /* merge the two sorted COO streams by (row,col); wPerm gets 1 for entries
         that came from A and 0 for entries that came from B */
      PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
      thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n)); /* undo the in-place shift of B's columns */
#endif
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      /* split destination positions into cooPerm[0..Annz) (A's entries) and
         cooPerm[Annz..nz) (B's entries) — the two calls emulate partition_copy */
      auto pred = thrust::identity<int>();
      PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
      PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
#endif
      /* compress the merged row indices back into CSR row offsets */
      stat = cusparseXcoo2csr(Ccusp->handle,
                              Ccoo->data().get(),
                              c->nz,
                              m,
                              Ccsr->row_offsets->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat);
      PetscCall(PetscLogGpuTimeEnd());
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
                               Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix *CcsrT = new CsrMatrix;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated = PETSC_TRUE;
        Ccusp->rowoffsets_gpu = NULL;
        CmatT->cprowIndices = NULL;
        CmatT->mat = CcsrT;
        CcsrT->num_rows = n;
        CcsrT->num_cols = m;
        CcsrT->num_entries = c->nz;

        CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values = new THRUSTARRAY(c->nz);

        /* C' is just A' stacked on top of B': concatenate their CSR arrays,
           shifting B's transpose row offsets by nnz(A) */
        PetscCall(PetscLogGpuTimeBegin());
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
          thrust::advance(rT,-1); /* B's first offset (a->nz) overwrites A's closing offset */
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
          thrust::copy(titb,tite,rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        PetscCall(PetscLogGpuTimeEnd());

        PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
                                 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
                                 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* mirror the sparsity pattern (i,j only; values stay on the GPU) on the host
       so the generic SeqAIJ bookkeeping below works */
    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(m+1,&c->i));
    PetscCall(PetscMalloc1(c->nz,&c->j));
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii   = *Ccsr->row_offsets;
      jj   = *Ccsr->column_indices;
      PetscCallCUDA(cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    } else {
      PetscCallCUDA(cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    }
    PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt)));
    PetscCall(PetscMalloc1(m,&c->ilen));
    PetscCall(PetscMalloc1(m,&c->imax));
    c->maxnz = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax = 0;
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i+1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax,nn);
    }
    PetscCall(MatMarkDiagonal_SeqAIJ(*C));
    PetscCall(PetscMalloc1(c->nz,&c->a));
    (*C)->nonzerostate++;
    PetscCall(PetscLayoutSetUp((*C)->rmap));
    PetscCall(PetscLayoutSetUp((*C)->cmap));
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* MAT_REUSE_MATRIX: the pattern is fixed; scatter fresh values of A and B
       into C through the stored cooPerm permutation */
    PetscCheckFalse((*C)->rmap->n != B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT,(*C)->rmap->n,B->rmap->n);
    c = (Mat_SeqAIJ*)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
      PetscCheck(Ccusp->cooPerm,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
      PetscCheckFalse(Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
      PetscCheckFalse(Ccusp->nonzerostate != (*C)->nonzerostate,PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
      PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix*)Acusp->mat->mat;
      Bcsr = (CsrMatrix*)Bcusp->mat->mat;
      Ccsr = (CsrMatrix*)Ccusp->mat->mat;
      PetscCheckFalse(Acsr->num_entries != (PetscInt)Acsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %" PetscInt_FMT " != %" PetscInt_FMT,Acsr->num_entries,(PetscInt)Acsr->values->size());
      PetscCheckFalse(Bcsr->num_entries != (PetscInt)Bcsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %" PetscInt_FMT " != %" PetscInt_FMT,Bcsr->num_entries,(PetscInt)Bcsr->values->size());
      PetscCheckFalse(Ccsr->num_entries != (PetscInt)Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT,Ccsr->num_entries,(PetscInt)Ccsr->values->size());
      PetscCheckFalse(Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries,PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT,Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
      PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %" PetscInt_FMT " != %" PetscInt_FMT,(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid,Acsr->num_entries);
      PetscCall(PetscLogGpuTimeBegin());
      /* Cvalues[cooPerm[k]] = Avalues[k] for k in [0,Annz) */
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      thrust::for_each(zibait,zieait,VecCUDAEquals());
      /* Cvalues[cooPerm[Annz+k]] = Bvalues[k] for k in [0,Bnnz) */
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
      thrust::for_each(zibbit,ziebit,VecCUDAEquals());
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE));
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        PetscCheck(Ccusp->matTranspose,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      PetscCall(PetscLogGpuTimeEnd());
    }
  }
  PetscCall(PetscObjectStateIncrease((PetscObject)*C));
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
4629 
/* Gathers selected matrix values: v[k] = aa[idx[k]], k = 0,...,n-1, where aa is the
   device value array of A. v may reside on the host or on the device (detected via
   isCudaMem); idx is a host array. If idx is NULL the first n values are copied. */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool              dmem;                  /* true when v is device memory */
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A,&av));
  if (n && idx) {
    /* upload the gather indices, then gather on the device */
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt)));

    THRUSTARRAY *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      /* v is on the host: gather into a device scratch buffer first */
      w = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (w) {
      PetscCallCUDA(cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
    }
    delete w;
  } else {
    PetscCallCUDA(cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  /* when v is host memory the data moved device->host, so log GpuToCpu
     (was incorrectly PetscLogCpuToGpu) */
  if (!dmem) PetscCall(PetscLogGpuToCpu(n*sizeof(PetscScalar)));
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A,&av));
  PetscFunctionReturn(0);
}
4667