xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 6a5217c03994f2d95bb2e6dbd8bed42381aeb015)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
7 
8 #include <petscconf.h>
9 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
10 #include <../src/mat/impls/sbaij/seq/sbaij.h>
11 #include <../src/vec/vec/impls/dvecimpl.h>
12 #include <petsc/private/vecimpl.h>
13 #undef VecType
14 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
15 #include <thrust/adjacent_difference.h>
16 #include <thrust/async/for_each.h>
17 #include <thrust/iterator/constant_iterator.h>
18 #include <thrust/remove.h>
19 #include <thrust/sort.h>
20 #include <thrust/unique.h>
21 
/* Storage-format names laid out as PetscOptionsEnum() expects: the value names in
   0-based order, then the enum type name, the option prefix, and a null terminator. */
const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
    0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
  } cusparseCsr2CscAlg_t;
  */
  /* Note the SpMM list is ordered by integer value ("COO_ALG4" = 5 follows "CSR_ALG1" = 4),
     and the Csr2Csc list gets a dummy "INVALID" entry because cuSPARSE defines no value 0 */
  const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
  const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
  const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif
58 
59 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
60 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
61 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
62 
63 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
64 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
65 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
66 
67 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
68 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
69 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
70 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
71 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
72 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
73 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
74 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
75 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
76 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
77 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
78 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
79 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
80 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
81 
82 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
83 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
84 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
85 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
86 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
87 
88 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
89 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);
90 
91 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
92 static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscCount,const PetscInt[],const PetscInt[]);
93 static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
94 
/*
   MatFactorGetSolverType_seqaij_cusparse - Reports the solver package ("cusparse")
   associated with factor matrices created by this implementation; composed onto
   factor matrices under the name "MatFactorGetSolverType_C".
*/
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
101 
102 /*MC
103   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
104   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
106   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
107   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
108   algorithms are not recommended. This class does NOT support direct solver operations.
109 
110   Level: beginner
111 
112 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
113 M*/
114 
/*
   MatGetFactor_seqaijcusparse_cusparse - Creates the (empty) factor matrix B that will
   hold an LU/ILU/Cholesky/ICC factorization of A computed via the CUSPARSE solver package.

   Chooses the symbolic-factorization function pointers according to the factor type and
   to whether A is bound to the CPU; the corresponding numeric routines are installed
   later by the symbolic routines themselves.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B));
  PetscCall(MatSetSizes(*B,n,n,n,n));
  (*B)->factortype = ftype;
  PetscCall(MatSetType(*B,MATSEQAIJCUSPARSE));

  /* propagate CPU binding so the factor follows A onto the CPU when requested */
  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B,PETSC_TRUE));
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    PetscCall(MatSetBlockSizesFromMats(*B,A,A));
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      /* A is CPU-bound: fall back to the plain SeqAIJ symbolic routines */
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    /* preferred orderings: MATORDERINGND for full LU, natural ordering for incomplete factorizations */
    PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]));
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* nonzero structure is determined by the symbolic factorization later; skip allocation now */
  PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL));
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(0);
}
155 
/*
   MatCUSPARSESetFormat_SeqAIJCUSPARSE - Type-specific implementation behind
   MatCUSPARSESetFormat() for sequential AIJCUSPARSE matrices.

   A sequential matrix keeps a single storage structure, so MAT_CUSPARSE_MULT and
   MAT_CUSPARSE_ALL both simply record the requested format.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:   /* fall through: both operations set the same field */
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
173 
174 /*@
175    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
176    operation. Only the MatMult operation can use different GPU storage formats
177    for MPIAIJCUSPARSE matrices.
178    Not Collective
179 
180    Input Parameters:
181 +  A - Matrix of type SEQAIJCUSPARSE
182 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
183 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
184 
185    Output Parameter:
186 
187    Level: intermediate
188 
189 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
190 @*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation; PetscTryMethod() is a no-op when
     the matrix type does not provide "MatCUSPARSESetFormat_C" */
  PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));
  PetscFunctionReturn(0);
}
198 
/*
   MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE - Type-specific implementation behind
   MatCUSPARSESetUseCPUSolve(); records whether MatSolve should run on the CPU.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A,PetscBool use_cpu)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->use_cpu_solve = use_cpu;
  PetscFunctionReturn(0);
}
207 
208 /*@
209    MatCUSPARSESetUseCPUSolve - Sets use CPU MatSolve.
210 
211    Input Parameters:
212 +  A - Matrix of type SEQAIJCUSPARSE
213 -  use_cpu - set flag for using the built-in CPU MatSolve
214 
215    Output Parameter:
216 
217    Notes:
218    The cuSparse LU solver currently computes the factors with the built-in CPU method
219    and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
   This method is used to specify whether the solve is done on the CPU or the GPU (GPU is the default).
221 
222    Level: intermediate
223 
224 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
225 @*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A,PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation; no-op for other matrix types */
  PetscTryMethod(A,"MatCUSPARSESetUseCPUSolve_C",(Mat,PetscBool),(A,use_cpu));
  PetscFunctionReturn(0);
}
233 
/*
   MatSetOption_SeqAIJCUSPARSE - Handles MAT_FORM_EXPLICIT_TRANSPOSE locally and
   forwards every other option to the SeqAIJ implementation.
*/
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* when turning the option off, drop any cached transpose so a stale copy cannot
       be picked up if the option is re-enabled later */
    if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
    A->form_explicit_transpose = flg;
  } else {
    PetscCall(MatSetOption_SeqAIJ(A,op,flg));
  }
  PetscFunctionReturn(0);
}
249 
250 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
251 
/*
   MatLUFactorNumeric_SeqAIJCUSPARSE - Numeric LU factorization: performs the
   factorization on the CPU via MatLUFactorNumeric_SeqAIJ(), then selects the
   MatSolve implementation and (unless CPU solve was requested) stages the
   triangular factors on the GPU.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ         *b = (Mat_SeqAIJ*)B->data;
  IS                 isrow = b->row,iscol = b->col;
  PetscBool          row_identity,col_identity;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatLUFactorNumeric_SeqAIJ(B,A,info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* choose the MatSolve variant: the NaturalOrdering versions apply when both
     row and column permutations are identities */
  PetscCall(ISIdentity(isrow,&row_identity));
  PetscCall(ISIdentity(iscol,&col_identity));
  if (!cusparsestruct->use_cpu_solve) {
    if (row_identity && col_identity) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors onto the GPU */
  if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(0);
}
288 
/*
   MatSetFromOptions_SeqAIJCUSPARSE - Processes command line options that control the
   GPU storage format, the CPU-solve fallback, and the cuSPARSE algorithm selections
   (SpMV, SpMM, CSR-to-CSC) of a SeqAIJCUSPARSE matrix.

   Options are only examined for unfactored matrices (MAT_FACTOR_NONE).

   Uses PetscCall() directly throughout, matching the error-handling style of the rest
   of this file (the previous mixed `ierr = ...; PetscCall(ierr);` form and its now
   unused `ierr` local have been removed).
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  PetscCall(PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options"));
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                               "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format));

    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                               "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format));
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve","Use CPU (I)LU solve","MatCUSPARSESetUseCPUSolve",cusparsestruct->use_cpu_solve,&cusparsestruct->use_cpu_solve,&flg));
    if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A,cusparsestruct->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                               "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg));
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
#if PETSC_PKG_CUDA_VERSION_GE(11,2,0)
    PetscCheckFalse(flg && CUSPARSE_SPMV_CSR_ALG1 != 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#else
    PetscCheckFalse(flg && CUSPARSE_CSRMV_ALG1 != 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                               "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg));
    PetscCheckFalse(flg && CUSPARSE_SPMM_CSR_ALG1 != 4,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                               "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg));
    PetscCheckFalse(flg && CUSPARSE_CSR2CSC_ALG1 != 1,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscCall(PetscOptionsTail());
  PetscFunctionReturn(0);
}
329 
/*
   MatILUFactorSymbolic_SeqAIJCUSPARSE - Symbolic ILU: discards any stale GPU
   triangular-factor data, reuses the CPU SeqAIJ symbolic factorization, and
   installs the CUSPARSE numeric routine.
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
  PetscCall(MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
340 
/*
   MatLUFactorSymbolic_SeqAIJCUSPARSE - Symbolic LU: discards any stale GPU
   triangular-factor data, reuses the CPU SeqAIJ symbolic factorization, and
   installs the CUSPARSE numeric routine.
*/
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
  PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
351 
/*
   MatICCFactorSymbolic_SeqAIJCUSPARSE - Symbolic ICC: discards any stale GPU
   triangular-factor data, reuses the CPU SeqAIJ symbolic factorization, and
   installs the CUSPARSE numeric Cholesky routine.
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
  PetscCall(MatICCFactorSymbolic_SeqAIJ(B,A,perm,info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
362 
/*
   MatCholeskyFactorSymbolic_SeqAIJCUSPARSE - Symbolic Cholesky: discards any stale
   GPU triangular-factor data, reuses the CPU SeqAIJ symbolic factorization, and
   installs the CUSPARSE numeric Cholesky routine.
*/
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
373 
/*
   MatSeqAIJCUSPARSEBuildILULowerTriMatrix - Builds (or refreshes) the GPU copy of the
   unit lower triangular factor L from the host data of the factored matrix A.

   The host factor rows are expanded into a CSR matrix with an explicit 1.0 appended
   as the diagonal entry of every row.  On first call the full structure (row offsets,
   column indices, values) is assembled in pinned host buffers, uploaded into a
   CsrMatrix, and the cuSPARSE triangular-solve analysis is run; on later calls only
   the numerical values are refilled and re-uploaded, reusing the existing structure
   and analysis.  Only runs when the fresh data lives on the CPU (offload mask).
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    /* thrust/cusparse wrapper calls below may throw; convert to a PETSc error */
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {
        PetscScalar                       *AALo;

        /* pinned host buffer for the values, kept (as AA_h) for later value-only updates */
        PetscCallCUDA(cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallCUDA(cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt)));

        /* Fill the lower triangular matrix; row 0 holds only its unit diagonal entry */
        AiLo[0]  = (PetscInt) 0;
        AiLo[n]  = nzLower;
        AjLo[0]  = (PetscInt) 0;
        AALo[0]  = (MatScalar) 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        rowOffset= 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          /* copy the strictly-lower entries of row i, then append the unit diagonal */
          PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz));
          PetscCall(PetscArraycpy(&(AALo[offset]), v, nz));

          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
       #else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
       #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix: upload the host CSR arrays to device-side thrust vectors */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        PetscCallCUSPARSE(cusparse_create_analysis_info(&loTriFactor->solveInfo));
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* newer cuSPARSE needs an explicit work buffer for the analysis/solve */
        PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                               &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize));
      #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                         loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                         loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                         loTriFactor->csrMat->column_indices->data().get(),
                                         #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                         loTriFactor->solveInfo,
                                         loTriFactor->solvePolicy, loTriFactor->solveBuffer));
                                         #else
                                         loTriFactor->solveInfo));
                                         #endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        /* keep the pinned values buffer for value-only updates; index buffers are no longer needed */
        loTriFactor->AA_h = AALo;
        PetscCallCUDA(cudaFreeHost(AiLo));
        PetscCallCUDA(cudaFreeHost(AjLo));
        PetscCall(PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar)));
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          PetscCallCUDA(cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar)));
        }
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0]  = 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz));
          offset      += nz;
          loTriFactor->AA_h[offset] = 1.0; /* unit diagonal entry */
          offset      += 1;
          v  += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower*sizeof(PetscScalar)));
      }
    } catch(char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
517 
518 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
519 {
520   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
521   PetscInt                          n = A->rmap->n;
522   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
523   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
524   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
525   const MatScalar                   *aa = a->a,*v;
526   PetscInt                          *AiUp, *AjUp;
527   PetscInt                          i,nz, nzUpper, offset;
528 
529   PetscFunctionBegin;
530   if (!n) PetscFunctionReturn(0);
531   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
532     try {
533       /* next, figure out the number of nonzeros in the upper triangular matrix. */
534       nzUpper = adiag[0]-adiag[n];
535       if (!upTriFactor) {
536         PetscScalar *AAUp;
537 
538         PetscCallCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar)));
539 
540         /* Allocate Space for the upper triangular matrix */
541         PetscCallCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt)));
542         PetscCallCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt)));
543 
544         /* Fill the upper triangular matrix */
545         AiUp[0]=(PetscInt) 0;
546         AiUp[n]=nzUpper;
547         offset = nzUpper;
548         for (i=n-1; i>=0; i--) {
549           v  = aa + adiag[i+1] + 1;
550           vi = aj + adiag[i+1] + 1;
551 
552           /* number of elements NOT on the diagonal */
553           nz = adiag[i] - adiag[i+1]-1;
554 
555           /* decrement the offset */
556           offset -= (nz+1);
557 
558           /* first, set the diagonal elements */
559           AjUp[offset] = (PetscInt) i;
560           AAUp[offset] = (MatScalar)1./v[nz];
561           AiUp[i]      = AiUp[i+1] - (nz+1);
562 
563           PetscCall(PetscArraycpy(&(AjUp[offset+1]), vi, nz));
564           PetscCall(PetscArraycpy(&(AAUp[offset+1]), v, nz));
565         }
566 
567         /* allocate space for the triangular factor information */
568         PetscCall(PetscNew(&upTriFactor));
569         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
570 
571         /* Create the matrix description */
572         PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
573         PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
574        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
575         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
576        #else
577         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
578        #endif
579         PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
580         PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
581 
582         /* set the operation */
583         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
584 
585         /* set the matrix */
586         upTriFactor->csrMat = new CsrMatrix;
587         upTriFactor->csrMat->num_rows = n;
588         upTriFactor->csrMat->num_cols = n;
589         upTriFactor->csrMat->num_entries = nzUpper;
590 
591         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
592         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
593 
594         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
595         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
596 
597         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
598         upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
599 
600         /* Create the solve analysis information */
601         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
602         PetscCallCUSPARSE(cusparse_create_analysis_info(&upTriFactor->solveInfo));
603       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
604         PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
605                                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
606                                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
607                                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
608                                                &upTriFactor->solveBufferSize));
609         PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize));
610       #endif
611 
612         /* perform the solve analysis */
613         PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
614                                          upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
615                                          upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
616                                          upTriFactor->csrMat->column_indices->data().get(),
617                                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
618                                          upTriFactor->solveInfo,
619                                          upTriFactor->solvePolicy, upTriFactor->solveBuffer));
620                                          #else
621                                          upTriFactor->solveInfo));
622                                          #endif
623         PetscCallCUDA(WaitForCUDA());
624         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));
625 
626         /* assign the pointer */
627         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
628         upTriFactor->AA_h = AAUp;
629         PetscCallCUDA(cudaFreeHost(AiUp));
630         PetscCallCUDA(cudaFreeHost(AjUp));
631         PetscCall(PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar)));
632       } else {
633         if (!upTriFactor->AA_h) {
634           PetscCallCUDA(cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar)));
635         }
636         /* Fill the upper triangular matrix */
637         offset = nzUpper;
638         for (i=n-1; i>=0; i--) {
639           v  = aa + adiag[i+1] + 1;
640 
641           /* number of elements NOT on the diagonal */
642           nz = adiag[i] - adiag[i+1]-1;
643 
644           /* decrement the offset */
645           offset -= (nz+1);
646 
647           /* first, set the diagonal elements */
648           upTriFactor->AA_h[offset] = 1./v[nz];
649           PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz));
650         }
651         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
652         PetscCall(PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar)));
653       }
654     } catch(char *ex) {
655       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
656     }
657   }
658   PetscFunctionReturn(0);
659 }
660 
/*
  Pushes the split ILU triangular factors of A to the GPU and caches the
  row/column permutation index arrays on the device so that MatSolve can
  permute the right-hand side / solution when the ordering is not identity.

  Requires A->spptr to hold a valid Mat_SeqAIJCUSPARSETriFactors.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           rowIS    = aij->row;
  IS                           colIS    = aij->icol;
  PetscBool                    rowIsIdentity,colIsIdentity;
  PetscInt                     m        = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(factors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));

  if (!factors->workVector) factors->workVector = new THRUSTARRAY(m);
  factors->nnz = aij->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* cache the row permutation on the device (used by the lower-triangular solve);
     build it only once and only when the ordering is not the identity */
  PetscCall(ISIdentity(rowIS,&rowIsIdentity));
  if (!rowIsIdentity && !factors->rpermIndices) {
    const PetscInt *indices;

    PetscCall(ISGetIndices(rowIS,&indices));
    factors->rpermIndices = new THRUSTINTARRAY(m);
    factors->rpermIndices->assign(indices,indices+m);
    PetscCall(ISRestoreIndices(rowIS,&indices));
    PetscCall(PetscLogCpuToGpu(m*sizeof(PetscInt)));
  }

  /* cache the column permutation on the device (used by the upper-triangular solve) */
  PetscCall(ISIdentity(colIS,&colIsIdentity));
  if (!colIsIdentity && !factors->cpermIndices) {
    const PetscInt *indices;

    PetscCall(ISGetIndices(colIS,&indices));
    factors->cpermIndices = new THRUSTINTARRAY(m);
    factors->cpermIndices->assign(indices,indices+m);
    PetscCall(ISRestoreIndices(colIS,&indices));
    PetscCall(PetscLogCpuToGpu(m*sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}
703 
/*
  Builds (or refreshes) the GPU copies of the two triangular factors used by
  the ICC solves.  On the first call (no factor structs yet) the full CSR
  structure and values of both factors are assembled on the host in pinned
  memory and copied to the device, and the cusparse solve analysis is run on
  each factor.  On subsequent calls only the numerical values are recomputed
  and re-uploaded, reusing the existing device structures.

  Nothing is done when the matrix is empty or the values are already current
  on the GPU.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is deliberately viewed through both the SeqAIJ and
     SeqSBAIJ layouts here ('a' and 'b' alias the same pointer); presumably the
     ICC factor is stored in SBAIJ form -- confirm against the factor setup code */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers for the upper- and lower-factor values */
      PetscCallCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar)));
      PetscCallCUDA(cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar)));
      /* first call: build the CSR structure and values of both factors from scratch */
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz));
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            /* off-diagonal upper entries are negated; the lower-factor values are
               additionally scaled by the inverse diagonal */
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
       #else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
       #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        PetscCallCUSPARSE(cusparse_create_analysis_info(&upTriFactor->solveInfo));
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                               &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize));
      #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                         upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                         upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                         upTriFactor->csrMat->column_indices->data().get(),
                                         #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                         upTriFactor->solveInfo,
                                         upTriFactor->solvePolicy, upTriFactor->solveBuffer));
                                         #else
                                         upTriFactor->solveInfo));
                                         #endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
       #else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
       #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation: the lower factor is the stored upper factor solved
           with the TRANSPOSE operation, so it reuses the upper CSR pattern below */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        PetscCallCUSPARSE(cusparse_create_analysis_info(&loTriFactor->solveInfo));
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                               &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize));
      #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                         loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                         loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                         loTriFactor->csrMat->column_indices->data().get(),
                                         #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                         loTriFactor->solveInfo,
                                         loTriFactor->solvePolicy, loTriFactor->solveBuffer));
                                         #else
                                         loTriFactor->solveInfo));
                                         #endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        PetscCall(PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar))));
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
      } else {
        /* factors already exist on the GPU: only recompute and re-upload the values */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        PetscCheck(upTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        PetscCheck(loTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        PetscCall(PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar)));
      }
      PetscCallCUDA(cudaFreeHost(AAUp));
      PetscCallCUDA(cudaFreeHost(AALo));
    } catch(char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
920 
/*
  Pushes the ICC triangular factors of A to the GPU and caches the
  permutation (and its inverse) on the device for use by MatSolve when the
  ordering is not the identity.

  Requires A->spptr to hold a valid Mat_SeqAIJCUSPARSETriFactors.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  /* nnz of the full (symmetric) factor; only the upper triangle (a->nz entries,
     diagonal included) is stored */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices: build the device permutation arrays only once;
     without the !rpermIndices guard a repeated numeric factorization would
     re-allocate them and leak the previous device arrays (the ILU path
     already guards this way) */
  PetscCall(ISIdentity(ip,&perm_identity));
  if (!perm_identity && !cusparseTriFactors->rpermIndices) {
    IS             iip;
    const PetscInt *irip,*rip;

    PetscCall(ISInvertPermutation(ip,PETSC_DECIDE,&iip));
    PetscCall(ISGetIndices(iip,&irip));
    PetscCall(ISGetIndices(ip,&rip));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    PetscCall(ISRestoreIndices(iip,&irip));
    PetscCall(ISDestroy(&iip));
    PetscCall(ISRestoreIndices(ip,&rip));
    PetscCall(PetscLogCpuToGpu(2.*n*sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}
957 
/*
  Numeric Cholesky factorization for SEQAIJCUSPARSE: performs the
  factorization on the CPU, then selects the solve kernels (natural ordering
  vs permuted) and copies the triangular factors to the GPU.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ *factorData = (Mat_SeqAIJ*)B->data;
  IS         ordering    = factorData->row;
  PetscBool  naturalOrdering;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B,A,info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* determine which version of MatSolve needs to be used. */
  PetscCall(ISIdentity(ordering,&naturalOrdering));
  if (naturalOrdering) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  /* no GPU mat-solve variants are provided in either case */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(0);
}
986 
/*
  Builds the transposes (CSC form, stored as CSR) of the lower and upper
  triangular factors and runs the cusparse triangular-solve analysis on them,
  so that transposed solves can be performed as non-transposed solves on the
  transposed factors.

  Requires A->spptr to hold valid loTriFactorPtr/upTriFactorPtr; on return
  loTriFactorPtrTranspose and upTriFactorPtrTranspose are populated and the
  csr2csc and solve-analysis device buffers are allocated.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  PetscCall(PetscNew(&loTriFactorT));
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor:
     transposition flips the fill mode, everything else carries over */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor */
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                               loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                               loTriFactor->csrMat->values->data().get(),
                                               loTriFactor->csrMat->row_offsets->data().get(),
                                               loTriFactor->csrMat->column_indices->data().get(),
                                               loTriFactorT->csrMat->values->data().get(),
                                               loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                               CUSPARSE_ACTION_NUMERIC,indexBase,
                                               CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize));
#endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
  PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                  loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                  loTriFactor->csrMat->values->data().get(),
                                  loTriFactor->csrMat->row_offsets->data().get(),
                                  loTriFactor->csrMat->column_indices->data().get(),
                                  loTriFactorT->csrMat->values->data().get(),
                                  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                                  loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                  CUSPARSE_ACTION_NUMERIC, indexBase,
                                  CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer));
                                  #else
                                  loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                  CUSPARSE_ACTION_NUMERIC, indexBase));
                                  #endif
  PetscCallCUDA(WaitForCUDA());
  /* bug fix: this was PetscLogEventBegin(), which left the event unbalanced */
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0));

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
  PetscCallCUSPARSE(cusparse_create_analysis_info(&loTriFactorT->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                         loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                         loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                         loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                         &loTriFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize));
#endif

  /* perform the solve analysis */
  PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                   loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                   loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                   loTriFactorT->csrMat->column_indices->data().get(),
                                   #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                   loTriFactorT->solveInfo,
                                   loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
                                   #else
                                   loTriFactorT->solveInfo));
                                   #endif
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  PetscCall(PetscNew(&upTriFactorT));
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor */
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                               upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                               upTriFactor->csrMat->values->data().get(),
                                               upTriFactor->csrMat->row_offsets->data().get(),
                                               upTriFactor->csrMat->column_indices->data().get(),
                                               upTriFactorT->csrMat->values->data().get(),
                                               upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                               CUSPARSE_ACTION_NUMERIC,indexBase,
                                               CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize));
#endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
  PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                                  upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                  upTriFactor->csrMat->values->data().get(),
                                  upTriFactor->csrMat->row_offsets->data().get(),
                                  upTriFactor->csrMat->column_indices->data().get(),
                                  upTriFactorT->csrMat->values->data().get(),
                                  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                                  upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                  CUSPARSE_ACTION_NUMERIC, indexBase,
                                  CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer));
                                  #else
                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                  CUSPARSE_ACTION_NUMERIC, indexBase));
                                  #endif

  PetscCallCUDA(WaitForCUDA());
  /* bug fix: this was PetscLogEventBegin(), which left the event unbalanced */
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0));

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
  PetscCallCUSPARSE(cusparse_create_analysis_info(&upTriFactorT->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                         upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                         upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                         upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                         &upTriFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize));
  #endif

  /* perform the solve analysis */
  PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                   upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                   upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                   upTriFactorT->csrMat->column_indices->data().get(),
                                   #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                   upTriFactorT->solveInfo,
                                   upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
                                   #else
                                   upTriFactorT->solveInfo));
                                   #endif

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
1190 
/* Unary functor for thrust::transform: converts a PetscScalar to a PetscInt by
   truncating its real part.  Used below to recover the csr2csc permutation
   indices that were computed in a scalar-valued work array. */
struct PetscScalarToPetscInt
{
  __host__ __device__
  PetscInt operator()(PetscScalar v)
  {
    return (PetscInt)PetscRealPart(v);
  }
};
1199 
/*
  Form (or refresh) an explicit on-GPU transpose of a SeqAIJCUSPARSE matrix and
  cache it in cusparsestruct->matTranspose.  The structural part (row offsets,
  column indices) is built only once; on later calls with an up-to-date matrix
  (A->transupdated) the routine returns immediately, and when only the values
  changed the CSR path re-permutes them using the cached csr2csc_i permutation.

  Input Parameter:
.  A - the MATSEQAIJCUSPARSE matrix

  Notes:
  For CUDA >= 11 only the CSR storage format is supported; the ELL/HYB branch
  is the legacy CUDA < 11 code path.
*/
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  PetscCheck(matstruct,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  PetscCheck(!A->transupdated || matstructT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct");
  /* cached transpose still valid: nothing to do */
  if (A->transupdated) PetscFunctionReturn(0);
  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
  PetscCall(PetscLogGpuTimeBegin());
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
    /* non-CSR formats cannot be updated in place; rebuild from scratch */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
  }
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
    PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

    /* set alpha and beta */
    PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* allocate the CSR arrays for A^T: dimensions are swapped, nnz is the same */
      CsrMatrix *matrixT = new CsrMatrix;
      matstructT->mat = matrixT;
      matrixT->num_rows = A->cmap->n;
      matrixT->num_cols = A->rmap->n;
      matrixT->num_entries = a->nz;
      matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values = new THRUSTARRAY(a->nz);

      /* 32-bit copy of the host row offsets, needed as csr2csc input */
      if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      #if PETSC_PKG_CUDA_VERSION_GE(11,2,1)
        stat = cusparseCreateCsr(&matstructT->matDescr,
                               matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                               matrixT->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase,cusparse_scalartype);PetscCallCUSPARSE(stat);
      #else
        /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
           see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

           I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
           it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
           when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
        */
        if (matrixT->num_entries) {
          stat = cusparseCreateCsr(&matstructT->matDescr,
                                 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                                 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                                 matrixT->values->data().get(),
                                 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I,
                                 indexBase,cusparse_scalartype);PetscCallCUSPARSE(stat);

        } else {
          matstructT->matDescr = NULL;
          matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
        }
      #endif
     #endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
   #else
      /* legacy CUDA < 11 path: HYB -> CSR -> CSC(=transpose) -> HYB */
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows = A->rmap->n;
      temp->num_cols = A->cmap->n;
      temp->num_entries = a->nz;
      temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle,
                              matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get());PetscCallCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      /* NOTE(review): for the transpose one would expect num_rows = A->cmap->n and
         num_cols = A->rmap->n (and row_offsets sized A->cmap->n+1); this is fine for
         square matrices but looks suspect for rectangular ones -- confirm */
      tempT->num_rows = A->rmap->n;
      tempT->num_cols = A->cmap->n;
      tempT->num_entries = a->nz;
      tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                              temp->num_cols, temp->num_entries,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get(),
                              tempT->values->data().get(),
                              tempT->column_indices->data().get(),
                              tempT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);PetscCallCUSPARSE(stat);

      /* Last, convert CSC to HYB */
      cusparseHybMat_t hybMat;
      PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
      cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
        CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                              matstructT->descr, tempT->values->data().get(),
                              tempT->row_offsets->data().get(),
                              tempT->column_indices->data().get(),
                              hybMat, 0, partition);PetscCallCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY*) tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
        delete (CsrMatrix*) tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY*) temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
        delete (CsrMatrix*) temp;
      }
     #endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
    PetscCheck(matrix,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix");
    PetscCheck(matrix->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows");
    PetscCheck(matrix->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols");
    PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values");
    PetscCheck(matrixT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT");
    PetscCheck(matrixT->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows");
    PetscCheck(matrixT->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols");
    PetscCheck(matrixT->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
    }
    if (!cusparsestruct->csr2csc_i) {
      /* Build the CSR->CSC value permutation once: run csr2csc on the sequence
         0,1,2,... stored as scalars, then read back the permuted indices. */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      void   *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                           A->cmap->n, matrix->num_entries,
                                           matrix->values->data().get(),
                                           cusparsestruct->rowoffsets_gpu->data().get(),
                                           matrix->column_indices->data().get(),
                                           matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                           CUSPARSE_ACTION_NUMERIC,indexBase,
                                           cusparsestruct->csr2cscAlg, &csr2cscBufferSize);PetscCallCUSPARSE(stat);
      PetscCallCUDA(cudaMalloc(&csr2cscBuffer,csr2cscBufferSize));
     #endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                              A->cmap->n,matrix->num_entries,
                              csr2csc_a.data().get(),
                              cusparsestruct->rowoffsets_gpu->data().get(),
                              matrix->column_indices->data().get(),
                              matrixT->values->data().get(),
                             #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                              matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                              CUSPARSE_ACTION_NUMERIC,indexBase,
                              cusparsestruct->csr2cscAlg, csr2cscBuffer);PetscCallCUSPARSE(stat);
                             #else
                              matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);PetscCallCUSPARSE(stat);
                             #endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
      }

      /* matrixT->values currently holds the permuted 0,1,2,... sequence; convert it to integers */
      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      PetscCallCUDA(cudaFree(csr2cscBuffer));
     #endif
    }
    /* gather the (possibly new) values of A into A^T using the cached permutation */
    PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
                                                     thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
                                                     matrixT->values->begin()));
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
1427 
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
  MatSolveTranspose implementation for factored SeqAIJCUSPARSE matrices with
  non-trivial row/column permutations: applies the row permutation to b, solves
  with the transposed upper factor, then the transposed lower factor, and
  finally applies the column permutation.  The transposed factors are generated
  lazily on first use.

  Input Parameters:
+  A  - the factored matrix
-  bb - the right-hand side vector

  Output Parameter:
.  xx - the solution vector
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
  PetscCall(VecCUDAGetArrayRead(bb,&barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation: xGPU <- b(rperm) */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U: rhs is xarray (permuted b), result is written to tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                        tempGPU->data().get());PetscCallCUSPARSE(stat);
                      #endif

  /* Then, solve L: rhs is tempGPU, result is written to xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                         xarray);PetscCallCUSPARSE(stat);
                      #endif

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb,&barray));
  PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
  PetscCall(PetscLogGpuTimeEnd());
  /* two triangular solves: roughly 2*nnz - n flops */
  PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1515 
/*
  MatSolveTranspose implementation for factored SeqAIJCUSPARSE matrices in
  natural ordering (identity row/column permutations), so no permutation
  copies are needed: solve with the transposed upper factor, then the
  transposed lower factor.  The transposed factors are generated lazily.

  Input Parameters:
+  A  - the factored matrix
-  bb - the right-hand side vector

  Output Parameter:
.  xx - the solution vector
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
  PetscCall(VecCUDAGetArrayRead(bb,&barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve U: rhs is barray, result is written to tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                        tempGPU->data().get());PetscCallCUSPARSE(stat);
                      #endif

  /* Then, solve L: rhs is tempGPU, result is written to xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                        xarray);PetscCallCUSPARSE(stat);
                      #endif

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb,&barray));
  PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
  PetscCall(PetscLogGpuTimeEnd());
  /* two triangular solves: roughly 2*nnz - n flops */
  PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1584 
/*
  MatSolve implementation for factored SeqAIJCUSPARSE matrices with
  non-trivial row/column permutations: apply the row permutation to b,
  solve with L, then with U, then apply the column permutation.

  Input Parameters:
+  A  - the factored matrix
-  bb - the right-hand side vector

  Output Parameter:
.  xx - the solution vector
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
  PetscCall(VecCUDAGetArrayRead(bb,&barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation: tempGPU <- b(rperm) */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L: rhs is tempGPU, result is written to xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                         xarray,
                         loTriFactor->solvePolicy, loTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                         xarray);PetscCallCUSPARSE(stat);
                      #endif

  /* Then, solve U: rhs is xarray, result is written to tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,xarray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                        tempGPU->data().get());PetscCallCUSPARSE(stat);
                      #endif

  /* Last, reorder with the column permutation: xGPU <- tempGPU(cperm) */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  PetscCall(VecCUDARestoreArrayRead(bb,&barray));
  PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
  PetscCall(PetscLogGpuTimeEnd());
  /* two triangular solves: roughly 2*nnz - n flops */
  PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1659 
/*
  MatSolve implementation for factored SeqAIJCUSPARSE matrices in natural
  ordering (identity row/column permutations): solve with L, then with U,
  without any permutation copies.

  Input Parameters:
+  A  - the factored matrix
-  bb - the right-hand side vector

  Output Parameter:
.  xx - the solution vector
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
  PetscCall(VecCUDAGetArrayRead(bb,&barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve L: rhs is barray, result is written to tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        loTriFactor->solvePolicy,loTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                        tempGPU->data().get());PetscCallCUSPARSE(stat);
                      #endif

  /* Next, solve U: rhs is tempGPU, result is written to xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                        xarray);PetscCallCUSPARSE(stat);
                      #endif

  PetscCall(VecCUDARestoreArrayRead(bb,&barray));
  PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
  PetscCall(PetscLogGpuTimeEnd());
  /* two triangular solves: roughly 2*nnz - n flops */
  PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1720 
/* Synchronizes the host copy of the matrix values with the GPU copy.
   Only the values array is transferred: pattern changes are handled by
   MatSeqAIJCUSPARSECopyToGPU(), so a device-only matrix always shares the
   host nonzero pattern. No-op unless the data lives exclusively on the GPU. */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *seq  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    CsrMatrix *csr = (CsrMatrix*)cusp->mat->mat;

    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0));
    /* copy the values device-to-host into the existing host CSR storage */
    PetscCallCUDA(cudaMemcpy(seq->a, csr->values->data().get(), seq->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    PetscCallCUDA(WaitForCUDA());
    PetscCall(PetscLogGpuToCpu(seq->nz*sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0));
    A->offloadmask = PETSC_OFFLOAD_BOTH; /* host and device now agree */
  }
  PetscFunctionReturn(0);
}
1739 
/* Returns a read/write pointer to the host values array, first syncing
   the host copy from the GPU if the device holds the current data */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  /* make sure the host values are current before exposing them */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  aij    = (Mat_SeqAIJ*)A->data;
  *array = aij->a;
  PetscFunctionReturn(0);
}
1747 
/* Companion of MatSeqAIJGetArray_SeqAIJCUSPARSE(): the caller may have
   modified the host values, so the device copy must be marked stale */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL; /* invalidate the caller's pointer */
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
1755 
/* Returns a read-only pointer to the host values array, syncing the host
   copy from the GPU first when necessary */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  /* bring the latest values back to the host before handing out the pointer */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  aij    = (Mat_SeqAIJ*)A->data;
  *array = aij->a;
  PetscFunctionReturn(0);
}
1763 
/* Companion of MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(): read access cannot
   invalidate the device copy, so the offload mask is left untouched */
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL; /* invalidate the caller's pointer */
  PetscFunctionReturn(0);
}
1770 
/* Returns the host values array for write-only access. No device-to-host
   sync is performed: the caller is expected to overwrite all values, so the
   current contents of the returned array are unspecified */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  *array = aij->a;
  PetscFunctionReturn(0);
}
1777 
/* Companion of MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(): the host values
   were (re)written, so the device copy is now stale */
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL; /* invalidate the caller's pointer */
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
1785 
/* Returns device pointers to the CSR arrays (row offsets i, column indices j,
   values a) of A, copying up-to-date data to the GPU first, and reports the
   memory type as PETSC_MEMTYPE_CUDA. Any of i/j/a/mtype may be NULL if not
   wanted. Only unfactored matrices in CSR storage are supported: with ELL/HYB
   storage cusp->mat->mat holds a cusparseHybMat_t, not a CsrMatrix, so the
   cast below would produce an invalid pointer. Row offsets and column indices
   are stored as 32-bit ints on the device, hence the 64-bit-indices error. */
static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A,const PetscInt **i,const PetscInt **j,PetscScalar **a,PetscMemType *mtype)
{
  Mat_SeqAIJCUSPARSE           *cusp;
  CsrMatrix                    *matrix;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
  cusp = static_cast<Mat_SeqAIJCUSPARSE*>(A->spptr);
  PetscCheck(cusp != NULL,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"cusp is NULL");
  PetscCheck(cusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Only for MAT_CUSPARSE_CSR format");
  matrix = (CsrMatrix*)cusp->mat->mat;

  if (i) {
   #if !defined(PETSC_USE_64BIT_INDICES)
    *i = matrix->row_offsets->data().get();
   #else
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSparse does not support 64-bit indices");
   #endif
  }
  if (j) {
   #if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
   #else
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSparse does not support 64-bit indices");
   #endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(0);
}
1816 
/* Copies the matrix from the host to the GPU. Two paths:
     - fast path: the nonzero pattern is unchanged and the format is CSR, so only
       the values array is transferred into the existing device structures;
     - rebuild path: the pattern changed (or a non-CSR format is used), so all
       cuSPARSE structures are destroyed and recreated from the host CSR data.
   No-op unless the current data lives on the host (UNALLOCATED or CPU mask). */
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE; /* set to FALSE when the host has only a pattern (a->a == NULL), so the mask cannot become BOTH */

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu,PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      PetscCheck(!a->nz || a->a,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0));
      matrix->values->assign(a->a, a->a+a->nz); /* thrust assign performs the host-to-device transfer */
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0));
      /* values changed but the pattern did not: invalidate only the transpose values */
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
    } else {
      PetscInt nnz;
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0));
      /* pattern changed: tear down all existing device structures before rebuilding */
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        /* with compressed row storage only the nonempty rows are represented */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheck(ii,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
        /* a->a may be NULL when only the pattern has been set; then nnz comes from the row offsets */
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
        else nnz = a->nz;
        PetscCheck(!nnz || a->j,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        /* device-resident scalar constants (1 and 0) used with CUSPARSE_POINTER_MODE_DEVICE below */
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz); /* values may be absent (pattern-only matrix) */

          /* assign the pointer */
          matstruct->mat = mat;
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                    mat->num_rows, mat->num_cols, mat->num_entries,
                                    mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                    mat->values->data().get(),
                                    CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                    CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);PetscCallCUSPARSE(stat);
          }
         #endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
         #else
          /* pre-CUDA-11 path: stage a temporary CSR copy on the device, convert it
             to a cusparseHybMat_t, then discard the CSR staging arrays */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
              matstruct->descr, mat->values->data().get(),
              mat->row_offsets->data().get(),
              mat->column_indices->data().get(),
              hybMat, 0, partition);PetscCallCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
         #endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m; /* count the rindex transfer in the logging below */
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        PetscCall(PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0));
      cusparsestruct->nonzerostate = A->nonzerostate; /* remember the pattern we just uploaded */
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
1971 
/* Thrust functor for use with zip iterators over (src,dst) tuples:
   accumulates element 0 into element 1 (dst = src + dst) */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t) + thrust::get<1>(t);
  }
};
1981 
/* Thrust functor for use with zip iterators over (src,dst) tuples:
   copies element 0 into element 1 (dst = src) */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    auto src = thrust::get<0>(t);
    thrust::get<1>(t) = src;
  }
};
1991 
/* Thrust functor for use with zip iterators over tuples: copies element 1
   into element 0 — the reverse direction of VecCUDAEquals */
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    auto src = thrust::get<1>(t);
    thrust::get<0>(t) = src;
  }
};
2001 
/* Per-product auxiliary data attached to C->product->data for cuSPARSE
   sparse*dense (SpMM) and sparse*sparse (SpGEMM) products; released by
   MatDestroy_MatMatCusparse() */
struct MatMatCusparse {
  PetscBool             cisdense;  /* caller's C was MATSEQDENSE (CPU): convert the result back after computing on the GPU */
  PetscScalar           *Bt;       /* device buffer for an explicit B^T (pre-CUDA-11 csrmm cannot transpose B) */
  Mat                   X;         /* intermediate dense product used for PtAP/RARt before the final dense GEMM */
  PetscBool             reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops;    /* flop count accumulated for logging */
  CsrMatrix             *Bcsr;    /* uncompressed copy of B when B uses compressed row storage */

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr; /* descriptor for Bcsr (only when B is in compressed row storage) */
  PetscBool             initialized;   /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr;  /* cached dense descriptor for B */
  cusparseDnMatDescr_t  matCDescr;  /* cached dense descriptor for C (or X) */
  PetscInt              Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  void                  *dBuffer4; /* work buffers for cusparseSpGEMMreuse (CUDA >= 11.4) */
  void                  *dBuffer5;
 #endif
  size_t                mmBufferSize; /* size of mmBuffer in bytes */
  void                  *mmBuffer;    /* SpMM/SpGEMM compute buffer */
  void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc; /* opaque SpGEMM state shared between symbolic and numeric phases */
#endif
};
2026 
/* Destructor for MatMatCusparse product data: releases the device buffers,
   the cuSPARSE descriptors and the intermediate dense matrix, then frees the
   struct itself. Unset members are NULL, so each release is guarded (and
   cudaFree(NULL) is a documented no-op). */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mm = (MatMatCusparse*)data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(mm->Bt));
  delete mm->Bcsr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (mm->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mm->matSpBDescr));
  if (mm->matBDescr)   PetscCallCUSPARSE(cusparseDestroyDnMat(mm->matBDescr));
  if (mm->matCDescr)   PetscCallCUSPARSE(cusparseDestroyDnMat(mm->matCDescr));
  if (mm->spgemmDesc)  PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mm->spgemmDesc));
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  if (mm->dBuffer4)  PetscCallCUDA(cudaFree(mm->dBuffer4));
  if (mm->dBuffer5)  PetscCallCUDA(cudaFree(mm->dBuffer5));
 #endif
  if (mm->mmBuffer)  PetscCallCUDA(cudaFree(mm->mmBuffer));
  if (mm->mmBuffer2) PetscCallCUDA(cudaFree(mm->mmBuffer2));
 #endif
  PetscCall(MatDestroy(&mm->X));
  PetscCall(PetscFree(data));
  PetscFunctionReturn(0);
}
2050 
2051 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
2052 
/* Numeric phase of a product with A sparse (SeqAIJCUSPARSE) and B dense:
   computes op(A)*op(B) with cusparseSpMM (CUDA >= 11) or csrmm (older CUDA),
   then finishes PtAP/RARt with an extra dense GEMM against B. The result C
   (or the intermediate X for PtAP/RARt) is dense on the GPU. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda; /* dims of op(A)*op(B) and leading dims of B and C */
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A    = product->A;
  B    = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  cusp   = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* pick the sparse operand (A or its explicit transpose), the cusparse op,
     and the dimensions m x n of the SpMM output for each product type */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP: /* PtAP first computes X = A*P here, then B^T*X below */
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      /* use a stored explicit transpose of A instead of the transpose op */
      PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt: /* RARt first computes X = A*R^T here, then R*X below */
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  PetscCheck(mat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda));
  if (!biscuda) PetscCall(MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B));
  PetscCall(MatDenseCUDAGetArrayRead(B,&barray));

  PetscCall(MatDenseGetLDA(B,&blda));
  /* PtAP/RARt write the SpMM result into the intermediate X, not into C */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseCUDAGetArrayWrite(mmdata->X,&carray));
    PetscCall(MatDenseGetLDA(mmdata->X,&clda));
  } else {
    PetscCall(MatDenseCUDAGetArrayWrite(C,&carray));
    PetscCall(MatDenseGetLDA(C,&clda));
  }

  PetscCall(PetscLogGpuTimeBegin());
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    /* dense descriptors are cached; rebuild one only when its leading dimension changed */
    if (mmdata->initialized && mmdata->Blda != blda) {PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL));
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL));
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);PetscCallCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmBufferSize);PetscCallCUSPARSE(stat);
    /* grow (never shrink) the SpMM work buffer */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      PetscCallCUDA(cudaFree(mmdata->mmBuffer));
      PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer,mmBufferSize));
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get()));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray));
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->mmBuffer);PetscCallCUSPARSE(stat);
 #else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* build B^T explicitly with a cuBLAS out-of-place transpose (geam) into mmdata->Bt */
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);PetscCallCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k    = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);PetscCallCUSPARSE(stat);
 #endif
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(n*2.0*csrmat->num_entries));
  PetscCall(MatDenseCUDARestoreArrayRead(B,&barray));
  /* finish PtAP/RARt: C = B^T*X (PtAP) or C = B*X (RARt) via dense GEMM */
  if (product->type == MATPRODUCT_RARt) {
    PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE));
  } else if (product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE));
  } else {
    PetscCall(MatDenseCUDARestoreArrayWrite(C,&carray));
  }
  /* convert results back to the types the caller originally passed in */
  if (mmdata->cisdense) {
    PetscCall(MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C));
  }
  if (!biscuda) {
    PetscCall(MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B));
  }
  PetscFunctionReturn(0);
}
2223 
/* Symbolic phase for products with A sparse (SeqAIJCUSPARSE) and B dense:
   sets the sizes and type of C, allocates the MatMatCusparse product data
   (including the intermediate X for PtAP/RARt and the B^T buffer for
   pre-CUDA-11 ABt/RARt), and installs the numeric phase routine. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n; /* dimensions of the resulting C */
  PetscBool          cisdense,flg;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscCheck(cusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  /* result dimensions per product type: AB -> mA x nB, AtB -> nA x nB,
     ABt -> mA x mB, PtAP -> nB x nB, RARt -> mB x mB */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  PetscCall(MatSetSizes(C,m,n,m,n));
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense));
  PetscCall(MatSetType(C,MATSEQDENSECUDA));

  /* product data */
  PetscCall(PetscNew(&mmdata));
  mmdata->cisdense = cisdense;
 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    PetscCallCUDA(cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar)));
  }
 #endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X));
    PetscCall(MatSetType(mmdata->X,MATSEQDENSECUDA));
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      PetscCall(MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n));
    } else {
      PetscCall(MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n));
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2296 
2297 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2298 {
2299   Mat_Product                  *product = C->product;
2300   Mat                          A,B;
2301   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2302   Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
2303   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2304   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2305   PetscBool                    flg;
2306   cusparseStatus_t             stat;
2307   MatProductType               ptype;
2308   MatMatCusparse               *mmdata;
2309 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2310   cusparseSpMatDescr_t         BmatSpDescr;
2311 #endif
2312   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2313 
2314   PetscFunctionBegin;
2315   MatCheckProduct(C,1);
2316   PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
2317   PetscCall(PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg));
2318   PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
2319   mmdata = (MatMatCusparse*)C->product->data;
2320   A = product->A;
2321   B = product->B;
2322   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
2323     mmdata->reusesym = PETSC_FALSE;
2324     Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2325     PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2326     Cmat = Ccusp->mat;
2327     PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
2328     Ccsr = (CsrMatrix*)Cmat->mat;
2329     PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
2330     goto finalize;
2331   }
2332   if (!c->nz) goto finalize;
2333   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
2334   PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2335   PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg));
2336   PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
2337   PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2338   PetscCheck(!B->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2339   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2340   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2341   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2342   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2343   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2344   PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2345   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2346   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
2347 
2348   ptype = product->type;
2349   if (A->symmetric && ptype == MATPRODUCT_AtB) {
2350     ptype = MATPRODUCT_AB;
2351     PetscCheck(product->symbolic_used_the_fact_A_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric");
2352   }
2353   if (B->symmetric && ptype == MATPRODUCT_ABt) {
2354     ptype = MATPRODUCT_AB;
2355     PetscCheck(product->symbolic_used_the_fact_B_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric");
2356   }
2357   switch (ptype) {
2358   case MATPRODUCT_AB:
2359     Amat = Acusp->mat;
2360     Bmat = Bcusp->mat;
2361     break;
2362   case MATPRODUCT_AtB:
2363     Amat = Acusp->matTranspose;
2364     Bmat = Bcusp->mat;
2365     break;
2366   case MATPRODUCT_ABt:
2367     Amat = Acusp->mat;
2368     Bmat = Bcusp->matTranspose;
2369     break;
2370   default:
2371     SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2372   }
2373   Cmat = Ccusp->mat;
2374   PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2375   PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2376   PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
2377   Acsr = (CsrMatrix*)Amat->mat;
2378   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
2379   Ccsr = (CsrMatrix*)Cmat->mat;
2380   PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
2381   PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
2382   PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
2383   PetscCall(PetscLogGpuTimeBegin());
2384 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2385   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
2386   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
2387   #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2388     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
2389                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2390                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2391                                mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
2392   #else
2393     stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
2394                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2395                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2396                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);PetscCallCUSPARSE(stat);
2397     stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
2398                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2399                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
2400   #endif
2401 #else
2402   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
2403                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2404                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2405                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2406                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());PetscCallCUSPARSE(stat);
2407 #endif
2408   PetscCall(PetscLogGpuFlops(mmdata->flops));
2409   PetscCallCUDA(WaitForCUDA());
2410   PetscCall(PetscLogGpuTimeEnd());
2411   C->offloadmask = PETSC_OFFLOAD_GPU;
2412 finalize:
2413   /* shorter version of MatAssemblyEnd_SeqAIJ */
2414   PetscCall(PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz));
2415   PetscCall(PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n"));
2416   PetscCall(PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax));
2417   c->reallocs         = 0;
2418   C->info.mallocs    += 0;
2419   C->info.nz_unneeded = 0;
2420   C->assembled = C->was_assembled = PETSC_TRUE;
2421   C->num_ass++;
2422   PetscFunctionReturn(0);
2423 }
2424 
/*
  Symbolic phase of C = A*B, A^t*B or A*B^t for two MATSEQAIJCUSPARSE matrices,
  using cuSPARSE SpGEMM on the device.

  Three backends are selected at compile time:
    - CUDA >= 11.4: cusparseSpGEMMreuse_* (structure computed once, values recomputed cheaply)
    - CUDA >= 11.0: cusparseSpGEMM_*
    - CUDA <  11.0: legacy cusparseXcsrgemmNnz + csrgemm (symbolic-only is not possible,
      so numeric values are computed here as well)

  Transposed operands are handled by forming an explicit transpose of A (AtB) or B (ABt),
  since cuSPARSE SpGEMM does not support transpose operations. Compressed-row storage of
  A and B is also handled: C inherits A's compressed-row structure, and a compressed B is
  temporarily viewed with full row offsets. After the device computation the CSR structure
  of C is copied back to the host to populate the Mat_SeqAIJ metadata (i, j, ilen, imax, ...).
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *a,*b,*c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     i,j,m,n,k;
  PetscBool                    flg;
  cusparseStatus_t             stat;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble               flops;
  PetscBool                    biscompressed,ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
  cusparseSpMatDescr_t         BmatSpDescr;
#else
  int                          cnz;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ*)A->data;
  b = (Mat_SeqAIJ*)B->data;
  /* product data */
  PetscCall(PetscNew(&mmdata));
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");

  /* If A (resp. B) is symmetric, A^t*B (resp. A*B^t) reduces to A*B and we avoid
     forming an explicit transpose. The numeric phase checks the recorded flag. */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
  }
  if (B->symmetric && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
  }
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  /* Select operand mult structs and result dimensions (m x n, inner dim k);
     explicit transposes are built on demand for AtB/ABt */
  switch (ptype) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    k = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  PetscCall(MatSetSizes(C,m,n,m,n));
  PetscCall(MatSetType(C,MATSEQAIJCUSPARSE));
  c     = (Mat_SeqAIJ*)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
  Ccsr  = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
    c->compressedrow.nrows = a->compressedrow.nrows;
    PetscCall(PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex));
    PetscCall(PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows));
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
  Ccusp->mat      = Cmat;
  Ccusp->mat->mat = Ccsr;
  Ccsr->num_rows    = Ccusp->nrows;
  Ccsr->num_cols    = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
  PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  /* device-resident scalar constants used with CUSPARSE_POINTER_MODE_DEVICE */
  PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar)));
  PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar)));
  PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
  PetscCallCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
    thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
    c->nz = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix*)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix */
    /* Wrap B's compressed data in a temporary CsrMatrix that shares column
       indices/values but uses full (uncompressed) row offsets, since SpGEMM
       needs a standard CSR view. Owned by mmdata; freed with the product. */
    CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
    Bcsr = new CsrMatrix;
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices;
    Bcsr->values         = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt)));
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr = Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
                               Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                               Bcsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  /* precompute flops count */
  if (ptype == MATPRODUCT_AB) {
    /* each A(i,brow) touches the whole row brow of B: 2*nnz(B(brow,:)) flops */
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i+1];
      for (j=st; j<en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2.*(b->i[brow+1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    /* row i of A contributes anzi outer-product rows, each hitting bnzi entries of B */
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt anzi = a->i[i+1] - a->i[i];
      const PetscInt bnzi = b->i[i+1] - b->i[i];
      flops += (2.*anzi)*bnzi;
    }
  } else { /* TODO */
    flops = 0.;
  }

  mmdata->flops = flops;
  PetscCall(PetscLogGpuTimeBegin());

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  /* C starts with nnz = 0 and NULL arrays; cuSPARSE fills in the sizes, then we
     allocate and attach the real arrays with cusparseCsrSetPointers() below */
  stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
                          NULL, NULL, NULL,
                          CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                          CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
  PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
 {
  /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
     We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
  */
  void*  dBuffer1 = NULL;
  void*  dBuffer2 = NULL;
  void*  dBuffer3 = NULL;
  /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
  size_t bufferSize1 = 0;
  size_t bufferSize2 = 0;
  size_t bufferSize3 = 0;
  size_t bufferSize4 = 0;
  size_t bufferSize5 = 0;

  /*----------------------------------------------------------------------*/
  /* ask bufferSize1 bytes for external memory */
  stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                            CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                            &bufferSize1, NULL);PetscCallCUSPARSE(stat);
  PetscCallCUDA(cudaMalloc((void**) &dBuffer1, bufferSize1));
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                            CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                            &bufferSize1, dBuffer1);PetscCallCUSPARSE(stat);

  /*----------------------------------------------------------------------*/
  stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                 &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);PetscCallCUSPARSE(stat);
  PetscCallCUDA(cudaMalloc((void**) &dBuffer2, bufferSize2));
  PetscCallCUDA(cudaMalloc((void**) &dBuffer3, bufferSize3));
  PetscCallCUDA(cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4));
  stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                 &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);PetscCallCUSPARSE(stat);
  PetscCallCUDA(cudaFree(dBuffer1));
  PetscCallCUDA(cudaFree(dBuffer2));

  /*----------------------------------------------------------------------*/
  /* get matrix C non-zero entries C_nnz1 */
  PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
  c->nz = (PetscInt) C_nnz1;
  /* allocate matrix C */
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values         = new THRUSTARRAY(c->nz);PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  /* update matC with the new pointers */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());PetscCallCUSPARSE(stat);

  /*----------------------------------------------------------------------*/
  stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                  &bufferSize5, NULL);PetscCallCUSPARSE(stat);
  PetscCallCUDA(cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5));
  stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                  &bufferSize5, mmdata->dBuffer5);PetscCallCUSPARSE(stat);
  PetscCallCUDA(cudaFree(dBuffer3));
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                                     Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                     cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                     mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
  PetscCall(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024));
 }
 #else
  size_t bufSize2;
  /* ask bufferSize bytes for external memory */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, NULL);PetscCallCUSPARSE(stat);
  PetscCallCUDA(cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2));
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);PetscCallCUSPARSE(stat);
  /* ask bufferSize again bytes for external memory */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);PetscCallCUSPARSE(stat);
  /* The CUSPARSE documentation is not clear, nor the API
     We need both buffers to perform the operations properly!
     mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
     it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
     is stored in the descriptor! What a messy API... */
  PetscCallCUDA(cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize));
  /* compute the intermediate product of A * B */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);PetscCallCUSPARSE(stat);
  /* get matrix C non-zero entries C_nnz1 */
  PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
  c->nz = (PetscInt) C_nnz1;
  PetscCall(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024));
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
 #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
#else
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
  /* legacy path (CUDA < 11): nnz count first, then allocate, then full numeric gemm */
  stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);PetscCallCUSPARSE(stat);
  c->nz = cnz;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
     I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
     D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops));
  PetscCall(PetscLogGpuTimeEnd());
finalizesym:
  /* Mirror the device CSR structure of C on the host and fill in the Mat_SeqAIJ
     bookkeeping, as a preallocation + assembly would have done */
  c->singlemalloc = PETSC_FALSE;
  c->free_a       = PETSC_TRUE;
  c->free_ij      = PETSC_TRUE;
  PetscCall(PetscMalloc1(m+1,&c->i));
  PetscCall(PetscMalloc1(c->nz,&c->j));
  if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
    PetscInt *d_i = c->i;
    THRUSTINTARRAY ii(Ccsr->row_offsets->size());
    THRUSTINTARRAY jj(Ccsr->column_indices->size());
    ii   = *Ccsr->row_offsets;
    jj   = *Ccsr->column_indices;
    if (ciscompressed) d_i = c->compressedrow.i;
    PetscCallCUDA(cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
  } else {
    PetscInt *d_i = c->i;
    if (ciscompressed) d_i = c->compressedrow.i;
    PetscCallCUDA(cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
  }
  if (ciscompressed) { /* need to expand host row offsets */
    PetscInt r = 0;
    c->i[0] = 0;
    for (k = 0; k < c->compressedrow.nrows; k++) {
      const PetscInt next = c->compressedrow.rindex[k];
      const PetscInt old = c->compressedrow.i[k];
      for (; r < next; r++) c->i[r+1] = old;
    }
    for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
  }
  PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt)));
  PetscCall(PetscMalloc1(m,&c->ilen));
  PetscCall(PetscMalloc1(m,&c->imax));
  c->maxnz = c->nz;
  c->nonzerorowcnt = 0;
  c->rmax = 0;
  /* derive per-row lengths, nonzero-row count and max row length from c->i */
  for (k = 0; k < m; k++) {
    const PetscInt nn = c->i[k+1] - c->i[k];
    c->ilen[k] = c->imax[k] = nn;
    c->nonzerorowcnt += (PetscInt)!!nn;
    c->rmax = PetscMax(c->rmax,nn);
  }
  PetscCall(MatMarkDiagonal_SeqAIJ(C));
  PetscCall(PetscMalloc1(c->nz,&c->a));
  Ccsr->num_entries = c->nz;

  C->nonzerostate++;
  PetscCall(PetscLayoutSetUp(C->rmap));
  PetscCall(PetscLayoutSetUp(C->cmap));
  Ccusp->nonzerostate = C->nonzerostate;
  C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
  C->preallocated  = PETSC_TRUE;
  C->assembled     = PETSC_FALSE;
  C->was_assembled = PETSC_FALSE;
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
2821 
2822 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2823 
/* Query the boolean option that lets the user request the CPU backend for one
   product type. The same PetscOptionsBegin/Bool/End sequence was previously
   repeated verbatim for five product types, differing only in the strings, so
   it is factored here.

   Input parameters:
     mat       - the product matrix (provides comm, prefix and product->api_user)
     apiopt    - option name used when the product was created through the old
                 API (e.g. "-matmatmult_backend_cpu")
     apititle  - options-block title for the old API (e.g. "MatMatMult")
     prodtitle - options-block title for the MatProduct API (e.g. "MatProduct_AB")
     man       - manual page referenced by the help text

   Output parameter:
     usecpu    - set to PETSC_TRUE if the user asked for the CPU backend
                 (also read as the default value, so initialize before calling) */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE_UseCPU(Mat mat,const char apiopt[],const char apititle[],const char prodtitle[],const char man[],PetscBool *usecpu)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (mat->product->api_user) {
    /* old-style API (MatMatMult() & friends): per-operation option name */
    ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,apititle,"Mat");PetscCall(ierr);
    PetscCall(PetscOptionsBool(apiopt,"Use CPU code",man,*usecpu,usecpu,NULL));
    ierr = PetscOptionsEnd();PetscCall(ierr);
  } else {
    /* MatProduct API: a single generic option name shared by all product types */
    ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,prodtitle,"Mat");PetscCall(ierr);
    PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code",man,*usecpu,usecpu,NULL));
    ierr = PetscOptionsEnd();PetscCall(ierr);
  }
  PetscFunctionReturn(0);
}

/* handles sparse or dense B */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
  PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  /* determine whether B is dense, and whether B (and C for ABC) live on the GPU;
     matrices bound to the CPU force the CPU path */
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense));
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    PetscCall(PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp));
  }
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      PetscCall(PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp));
    }
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool usecpu = PETSC_FALSE;
    switch (product->type) {
    case MATPRODUCT_AB:
      PetscCall(MatProductSetFromOptions_SeqAIJCUSPARSE_UseCPU(mat,"-matmatmult_backend_cpu","MatMatMult","MatProduct_AB","MatMatMult",&usecpu));
      break;
    case MATPRODUCT_AtB:
      PetscCall(MatProductSetFromOptions_SeqAIJCUSPARSE_UseCPU(mat,"-mattransposematmult_backend_cpu","MatTransposeMatMult","MatProduct_AtB","MatTransposeMatMult",&usecpu));
      break;
    case MATPRODUCT_PtAP:
      PetscCall(MatProductSetFromOptions_SeqAIJCUSPARSE_UseCPU(mat,"-matptap_backend_cpu","MatPtAP","MatProduct_PtAP","MatPtAP",&usecpu));
      break;
    case MATPRODUCT_RARt:
      PetscCall(MatProductSetFromOptions_SeqAIJCUSPARSE_UseCPU(mat,"-matrart_backend_cpu","MatRARt","MatProduct_RARt","MatRARt",&usecpu));
      break;
    case MATPRODUCT_ABC:
      PetscCall(MatProductSetFromOptions_SeqAIJCUSPARSE_UseCPU(mat,"-matmatmatmult_backend_cpu","MatMatMatMult","MatProduct_ABC","MatMatMatMult",&usecpu));
      break;
    default:
      break; /* ABt has no CPU-backend option */
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (product->A->boundtocpu) {
        PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      /* no native CUSPARSE implementation; compose from AB/AtB products */
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
  }
  PetscFunctionReturn(0);
}
2946 
/* yy = A*xx on the GPU; delegates to the shared kernel with no addend (yy of the kernel is NULL) and op(A) = A */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE));
  PetscFunctionReturn(0);
}
2953 
/* zz = A*xx + yy on the GPU; op(A) = A (no transpose, no conjugation) */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE));
  PetscFunctionReturn(0);
}
2960 
/* yy = A^H * xx on the GPU (trans = herm = PETSC_TRUE selects the conjugate transpose) */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE));
  PetscFunctionReturn(0);
}
2967 
/* zz = A^H * xx + yy on the GPU (conjugate transpose) */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE));
  PetscFunctionReturn(0);
}
2974 
/* yy = A^T * xx on the GPU (trans = PETSC_TRUE, herm = PETSC_FALSE) */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE));
  PetscFunctionReturn(0);
}
2981 
/* y[idx[i]] += x[i] for 0 <= i < n.
   Uses a grid-stride loop with PetscInt indexing: the previous flat index
   (int)(blockIdx.x*blockDim.x + threadIdx.x) could overflow once n exceeds
   2^31-1 in a 64-bit PetscInt build, and the stride also keeps the kernel
   correct under any launch configuration, not only the ceil(n/256) one used
   by the caller. idx is read-only and is marked const accordingly. */
__global__ static void ScatterAdd(PetscInt n, const PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  const PetscInt stride = (PetscInt)blockDim.x*gridDim.x;
  for (PetscInt i = (PetscInt)blockIdx.x*blockDim.x + threadIdx.x; i < n; i += stride) y[idx[i]] += x[i];
}
2987 
/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op.
   Shared worker behind MatMult/MatMultAdd/MatMultTranspose/MatMultHermitianTranspose(Add):
   yy may be NULL (no addend) and may alias zz (in-place add). herm without trans is rejected. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny;
#endif

  PetscFunctionBegin;
  PetscCheck(!herm || trans,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
  if (!a->nz) {
    /* Empty matrix: op(A) x is zero, so z is y (or zero when there is no addend) */
    if (!yy) PetscCall(VecSet_SeqCUDA(zz,0));
    else PetscCall(VecCopy_SeqCUDA(yy,zz));
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    PetscCheck(matstruct,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !A->form_explicit_transpose) {
      /* Let cuSPARSE apply the (conjugate) transpose on the fly */
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      /* Multiply with an explicitly stored A^T (built lazily on first use) */
      if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    PetscCall(VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray));
    if (yy == zz) PetscCall(VecCUDAGetArray(zz,&zarray)); /* read & write zz, so need to get uptodate zarray on GPU */
    else PetscCall(VecCUDAGetArrayWrite(zz,&zarray)); /* write zz, so no need to init zarray on GPU */

    PetscCall(PetscLogGpuTimeBegin());
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
     #endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
     #endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* cuSpMV[] is indexed by opA, so opA must fit the three enum values we allocated for */
      PetscCheck(opA >= 0 && opA <= 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype));
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype));
        PetscCallCUSPARSE(cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                               matstruct->matDescr,
                                               matstruct->cuSpMV[opA].vecXDescr, beta,
                                               matstruct->cuSpMV[opA].vecYDescr,
                                               cusparse_scalartype,
                                               cusparsestruct->spmvAlg,
                                               &matstruct->cuSpMV[opA].spmvBufferSize));
        PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize));

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr));
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr));
      }

      PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA,
                                  matstruct->alpha_one,
                                  matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
                                  matstruct->cuSpMV[opA].vecXDescr,
                                  beta,
                                  matstruct->cuSpMV[opA].vecYDescr,
                                  cusparse_scalartype,
                                  cusparsestruct->spmvAlg,
                                  matstruct->cuSpMV[opA].spmvBuffer));
     #else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA,
                                       mat->num_rows, mat->num_cols,
                                       mat->num_entries, matstruct->alpha_one, matstruct->descr,
                                       mat->values->data().get(), mat->row_offsets->data().get(),
                                       mat->column_indices->data().get(), xptr, beta,
                                       dptr));
     #endif
    } else {
      if (cusparsestruct->nrows) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                         matstruct->alpha_one, matstruct->descr, hybMat,
                                         xptr, beta,
                                         dptr));
       #endif
      }
    }
    PetscCall(PetscLogGpuTimeEnd());

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          PetscCall(VecCopy_SeqCUDA(yy,zz)); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          PetscCall(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        PetscCall(VecSet_SeqCUDA(zz,0));
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        PetscCall(PetscLogGpuTimeBegin());
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
         */
       #if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
       #else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
       #endif
        PetscCall(PetscLogGpuTimeEnd());
      }
    } else {
      if (yy && yy != zz) {
        PetscCall(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */
      }
    }
    PetscCall(VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray));
    if (yy == zz) PetscCall(VecCUDARestoreArray(zz,&zarray));
    else PetscCall(VecCUDARestoreArrayWrite(zz,&zarray));
  } catch(char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (yy) {
    PetscCall(PetscLogGpuFlops(2.0*a->nz));
  } else {
    PetscCall(PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt));
  }
  PetscFunctionReturn(0);
}
3176 
/* zz = A^T * xx + yy on the GPU */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE));
  PetscFunctionReturn(0);
}
3183 
/* Finish assembly via the SeqAIJ path; a changed nonzero state invalidates the cached device matrix */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  Mat_SeqAIJCUSPARSE *cusp        = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscObjectState   state_before = A->nonzerostate;

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqAIJ(A,mode));
  if (cusp->deviceMat && state_before != A->nonzerostate) {
    PetscCall(PetscInfo(A,"Destroy device mat since nonzerostate changed\n"));
    PetscCallCUDA(cudaFree(cusp->deviceMat));
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}
3199 
3200 /* --------------------------------------------------------------------------------*/
3201 /*@
3202    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
3204    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
3205    assembly performance the user should preallocate the matrix storage by setting
3206    the parameter nz (or the array nnz).  By setting these parameters accurately,
3207    performance during matrix assembly can be increased by more than a factor of 50.
3208 
3209    Collective
3210 
3211    Input Parameters:
3212 +  comm - MPI communicator, set to PETSC_COMM_SELF
3213 .  m - number of rows
3214 .  n - number of columns
3215 .  nz - number of nonzeros per row (same for all rows)
3216 -  nnz - array containing the number of nonzeros in the various rows
3217          (possibly different for each row) or NULL
3218 
3219    Output Parameter:
3220 .  A - the matrix
3221 
3222    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
3224    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
3225 
3226    Notes:
3227    If nnz is given then nz is ignored
3228 
3229    The AIJ format (also called the Yale sparse matrix format or
3230    compressed row storage), is fully compatible with standard Fortran 77
3231    storage.  That is, the stored row and column indices can begin at
3232    either one (as in Fortran) or zero.  See the users' manual for details.
3233 
3234    Specify the preallocated storage with either nz or nnz (not both).
3235    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
3236    allocation.  For large problems you MUST preallocate memory or you
3237    will get TERRIBLE performance, see the users' manual chapter on matrices.
3238 
3239    By default, this format uses inodes (identical nodes) when possible, to
3240    improve numerical efficiency of matrix-vector products and solves. We
3241    search for consecutive rows with the same nonzero structure, thereby
3242    reusing matrix information to achieve increased efficiency.
3243 
3244    Level: intermediate
3245 
3246 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
3247 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscFunctionBegin;
  PetscCall(MatCreate(comm,A));
  PetscCall(MatSetSizes(*A,m,n,m,n)); /* sequential matrix: local sizes equal global sizes */
  PetscCall(MatSetType(*A,MATSEQAIJCUSPARSE));
  /* const is cast away to match the preallocation routine's signature; nnz is not modified */
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz));
  PetscFunctionReturn(0);
}
3257 
/* Destroy the GPU-side data, detach all composed methods, then run the base SeqAIJ destroy */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  const char *const composed[] = {"MatSeqAIJCopySubArray_C",
                                  "MatCUSPARSESetFormat_C",
                                  "MatCUSPARSESetUseCPUSolve_C",
                                  "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",
                                  "MatProductSetFromOptions_seqaijcusparse_seqdense_C",
                                  "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",
                                  "MatFactorGetSolverType_C",
                                  "MatSetPreallocationCOO_C",
                                  "MatSetValuesCOO_C",
                                  "MatConvert_seqaijcusparse_hypre_C"};

  PetscFunctionBegin;
  /* Factored matrices carry triangular-factor data; otherwise spptr is the plain cusparse struct */
  if (A->factortype == MAT_FACTOR_NONE) PetscCall(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr));
  else PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr));
  for (PetscInt i = 0; i < (PetscInt)(sizeof(composed)/sizeof(composed[0])); i++) {
    PetscCall(PetscObjectComposeFunction((PetscObject)A,composed[i],NULL));
  }
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(0);
}
3279 
3280 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
3281 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicate via the plain SeqAIJ duplicate, then convert the copy in place back to SEQAIJCUSPARSE */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqAIJ(A,cpvalues,B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B));
  PetscFunctionReturn(0);
}
3289 
/* Y = Y + a*X on the GPU when both matrices share the CUSPARSE implementation;
   otherwise (or for patterns cuSPARSE cannot handle well) falls back to the CPU kernel. */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry,*csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    /* Mixed implementations (e.g. one side bound to CPU): use the host path */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y,a,X,str));
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(cy->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  PetscCheck(cx->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;
  /* see if we can turn this into a cublas axpy: compare the CSR structures on the device */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
    if (eq) {
      eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
    }
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    /* X's pattern is contained in Y's: use cusparse csrgeam, writing the result back into Y in place */
    PetscScalar b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t      bufferSize;
    void        *buffer;
#endif

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay));
    /* scalars a and b live on the host, so switch the pointer mode temporarily */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
                                                  &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                                  &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize));
    PetscCallCUDA(cudaMalloc(&buffer,bufferSize));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                                       &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                       &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                       cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCallCUDA(cudaFree(buffer));
#else
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                                       &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                       &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                       cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get()));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical structure: the update is a plain BLAS-1 axpy on the value arrays */
    cublasHandle_t cublasv2handle;
    PetscBLASInt   one = 1, bnz = 1;

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay));
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    PetscCall(PetscBLASIntCast(x->nz,&bnz));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one));
    PetscCall(PetscLogGpuFlops(2.0*bnz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else {
    /* different patterns: structure of Y changes, so do it on the CPU */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y,a,X,str));
  }
  PetscFunctionReturn(0);
}
3383 
/* Y = a*Y: scale the nonzero value array in place on the device with one BLAS-1 call */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  Mat_SeqAIJ     *yaij = (Mat_SeqAIJ*)Y->data;
  cublasHandle_t handle;
  PetscScalar    *yarr;
  PetscBLASInt   inc = 1, nz = 1;

  PetscFunctionBegin;
  PetscCall(PetscCUBLASGetHandle(&handle));
  PetscCall(PetscBLASIntCast(yaij->nz,&nz));
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&yarr));
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXscal(handle,nz,&a,yarr,inc));
  PetscCall(PetscLogGpuFlops(nz));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&yarr));
  /* values changed, so the cached diagonal is stale */
  PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  PetscFunctionReturn(0);
}
3403 
/* Zero all stored values, on the device copy (and its cached transpose) when present, and on the host copy */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  Mat_SeqAIJ *aij   = (Mat_SeqAIJ*)A->data;
  PetscBool  ongpu  = PETSC_FALSE;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
    if (cusp->mat) {
      CsrMatrix *csr = (CsrMatrix*)cusp->mat->mat;
      if (csr->values) {
        ongpu = PETSC_TRUE;
        thrust::fill(thrust::device,csr->values->begin(),csr->values->end(),0.);
      }
    }
    if (cusp->matTranspose) {
      CsrMatrix *csr = (CsrMatrix*)cusp->matTranspose->mat;
      if (csr->values) thrust::fill(thrust::device,csr->values->begin(),csr->values->end(),0.);
    }
  }
  /* zero the host values as well so both copies agree */
  PetscCall(PetscArrayzero(aij->a,aij->i[A->rmap->n]));
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  A->offloadmask = ongpu ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
3432 
/* Switch A's method tables between the host (SeqAIJ) and device (SeqAIJCUSPARSE) implementations.
   flg = PETSC_TRUE binds to the CPU. For factored matrices only the flag is recorded. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    A->boundtocpu = flg;
    PetscFunctionReturn(0);
  }
  if (flg) {
    /* make the host copy current before handing control to the CPU kernels */
    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* wipe the SeqAIJ-level ops (getarray etc.) so the SeqAIJ defaults apply */
    PetscCall(PetscMemzero(a->ops,sizeof(Mat_SeqAIJOps)));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL));
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
   }
  A->boundtocpu = flg;
  /* inodes only help the CPU kernels */
  if (flg && a->inode.size) {
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3497 
/* Convert a MATSEQAIJ matrix to MATSEQAIJCUSPARSE (or refresh an already-converted one).

   A      - source matrix
   mtype  - target type (MATSEQAIJCUSPARSE); not consulted since there is only one target
   reuse  - MAT_INITIAL_MATRIX (create *newmat), MAT_REUSE_MATRIX (copy into existing
            *newmat) or MAT_INPLACE_MATRIX (convert A itself; *newmat aliases A)
   newmat - resulting matrix
*/
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  Mat              B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    PetscCall(MatDuplicate(A,MAT_COPY_VALUES,newmat));
  } else if (reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatCopy(A,*newmat,SAME_NONZERO_PATTERN));
  }
  B = *newmat; /* with MAT_INPLACE_MATRIX, B aliases A */

  /* vectors created from this matrix (e.g. by MatCreateVecs()) should be CUDA vectors */
  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA,&B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      /* regular matrix: attach the struct holding device CSR data and cusparse handle */
      Mat_SeqAIJCUSPARSE *spptr;
      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle,PetscDefaultCudaStream));
      spptr->format     = MAT_CUSPARSE_CSR;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
     #if PETSC_PKG_CUDA_VERSION_GE(11,2,0)
      spptr->spmvAlg    = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
     #else
      spptr->spmvAlg    = CUSPARSE_CSRMV_ALG1;    /* default, since we only support csr */
     #endif
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
     #endif
      B->spptr = spptr;
    } else {
      /* factored matrix: attach the triangular-factors struct instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle,PetscDefaultCudaStream));
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; /* no device copy of the data yet */
  }
  /* install the CUSPARSE implementations of the core matrix operations */
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* unbinding installs the remaining GPU operations and composed functions */
  PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE));
  PetscCall(PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetUseCPUSolve_C",MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(0);
}
3557 
/* Constructor for MATSEQAIJCUSPARSE: build a plain SeqAIJ matrix, then convert it in place */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqAIJ(B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B));
  PetscFunctionReturn(0);
}
3565 
3566 /*MC
3567    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3568 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
3570    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
3571    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3572 
3573    Options Database Keys:
3574 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
3575 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_use_cpu_solve - Do MatSolve on CPU
3578 
3579   Level: beginner
3580 
3581 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3582 M*/
3583 
3584 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);
3585 
/* Register the CUSPARSE-based factorization solvers with PETSc's solver-type registry */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  const MatFactorType ftypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};

  PetscFunctionBegin;
  /* the banded LU factorization has its own dedicated solver type */
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND,MATSEQAIJ,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band));
  /* the generic cusparse solver provides LU, Cholesky, ILU and ICC */
  for (size_t k = 0; k < sizeof(ftypes)/sizeof(ftypes[0]); k++) {
    PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftypes[k],MatGetFactor_seqaijcusparse_cusparse));
  }
  PetscFunctionReturn(0);
}
3597 
/* Free the COO-related device resources attached to the matrix so it can be
   given a new COO preallocation.

   All freed pointers are reset to NULL: MatSeqAIJCUSPARSE_Destroy() frees
   jmap_d/perm_d again when they are non-NULL, so leaving them dangling here
   would cause a double free (cudaFree(NULL) is a harmless no-op).
*/
static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat)
{
  Mat_SeqAIJCUSPARSE* cusp = (Mat_SeqAIJCUSPARSE*)mat->spptr;

  PetscFunctionBegin;
  if (!cusp) PetscFunctionReturn(0);
  delete cusp->cooPerm;
  delete cusp->cooPerm_a;
  cusp->cooPerm   = NULL;
  cusp->cooPerm_a = NULL;
  if (cusp->use_extended_coo) {
    PetscCallCUDA(cudaFree(cusp->jmap_d));
    PetscCallCUDA(cudaFree(cusp->perm_d));
    cusp->jmap_d = NULL; /* was left dangling before: Destroy would free it again */
    cusp->perm_d = NULL;
  }
  cusp->use_extended_coo = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3615 
/* Free a Mat_SeqAIJCUSPARSE struct and everything it owns: the device matrix and its
   cached transpose, work vectors, COO maps and the cusparse handle. No-op when NULL. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  Mat_SeqAIJCUSPARSE *cusp = *cusparsestruct;

  PetscFunctionBegin;
  if (!cusp) PetscFunctionReturn(0);
  PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat,cusp->format));
  PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format));
  delete cusp->workVector;
  delete cusp->rowoffsets_gpu;
  delete cusp->cooPerm;
  delete cusp->cooPerm_a;
  delete cusp->csr2csc_i;
  if (cusp->handle) PetscCallCUSPARSE(cusparseDestroy(cusp->handle));
  if (cusp->jmap_d) PetscCallCUDA(cudaFree(cusp->jmap_d));
  if (cusp->perm_d) PetscCallCUDA(cudaFree(cusp->perm_d));
  PetscCall(PetscFree(*cusparsestruct));
  PetscFunctionReturn(0);
}
3634 
/* Free a CsrMatrix (its device value/index vectors and the struct itself) and NULL
   the caller's pointer. Safe to call on an already-NULL pointer. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *csr = *mat;

  PetscFunctionBegin;
  if (csr) {
    delete csr->values;
    delete csr->column_indices;
    delete csr->row_offsets;
    delete csr;
    *mat = NULL;
  }
  PetscFunctionReturn(0);
}
3647 
/* Free one triangular-factor struct: its matrix descriptor, analysis info,
   CSR data and the solve/conversion scratch buffers. No-op when NULL. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  Mat_SeqAIJCUSPARSETriFactorStruct *fs = *trifactor;

  PetscFunctionBegin;
  if (!fs) PetscFunctionReturn(0);
  if (fs->descr)     PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->descr));
  if (fs->solveInfo) PetscCallCUSPARSE(cusparse_destroy_analysis_info(fs->solveInfo));
  PetscCall(CsrMatrix_Destroy(&fs->csrMat));
  if (fs->solveBuffer) PetscCallCUDA(cudaFree(fs->solveBuffer));
  if (fs->AA_h)        PetscCallCUDA(cudaFreeHost(fs->AA_h));
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (fs->csr2cscBuffer) PetscCallCUDA(cudaFree(fs->csr2cscBuffer));
 #endif
  PetscCall(PetscFree(*trifactor));
  PetscFunctionReturn(0);
}
3664 
/* Free a Mat_SeqAIJCUSPARSEMultStruct: the device matrix stored in the given
   format, its cusparse descriptors and the cached device scalar constants.

   matstruct - pointer to the struct pointer; set to NULL on return
   format    - storage format the struct was built with; determines how ->mat is freed
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
       #endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        PetscCall(CsrMatrix_Destroy(&mat)); /* was called bare before, silently dropping the error code */
      }
    }
    if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    /* device-resident scalar constants used by cusparse SpMV/SpMM calls */
    if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
    if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
    if ((*matstruct)->beta_one)  PetscCallCUDA(cudaFree((*matstruct)->beta_one));

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
    /* free the cached per-slot SpMV descriptors and buffers */
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
3706 
/* Release everything owned by the triangular factors except the cusparse handle,
   so the struct can be reused for a new factorization.

   All freed pointers are reset to NULL so the reset is idempotent: previously
   a_band_d/i_band_d were left dangling and a second Reset (or any later free of
   the same fields) would double-free device memory.
*/
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors)
{
  PetscFunctionBegin;
  if (*trifactors) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose));
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = NULL;
    (*trifactors)->cpermIndices = NULL;
    (*trifactors)->workVector = NULL;
    if ((*trifactors)->a_band_d)   PetscCallCUDA(cudaFree((*trifactors)->a_band_d));
    if ((*trifactors)->i_band_d)   PetscCallCUDA(cudaFree((*trifactors)->i_band_d));
    (*trifactors)->a_band_d = NULL; /* avoid double free on repeated Reset */
    (*trifactors)->i_band_d = NULL;
    (*trifactors)->init_dev_prop = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3727 
/* Fully destroy the triangular-factors struct: reset its contents, destroy the
   cusparse handle and free the struct itself. No-op when NULL. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscFunctionBegin;
  if (*trifactors) {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
    /* test and destroy directly instead of the old assignment-inside-if with a spare local */
    if ((*trifactors)->handle) PetscCallCUSPARSE(cusparseDestroy((*trifactors)->handle));
    PetscCall(PetscFree(*trifactors));
  }
  PetscFunctionReturn(0);
}
3742 
/* Strict weak ordering on (row,col) tuples: lexicographic by row, then column */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    const PetscInt r1 = t1.get<0>(), r2 = t2.get<0>();
    if (r1 != r2) return r1 < r2;
    return t1.get<1>() < t2.get<1>();
  }
};
3753 
/* Equality on (row,col) tuples: true iff both components match */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>();
  }
};
3763 
/* Binary op for adjacent_difference: 1 where consecutive values differ, 0 where equal */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != t2) ? 1 : 0;
  }
};
3772 
/* Logical OR of two flags, returned as a 0/1 PetscInt */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != 0 || t2 != 0) ? 1 : 0;
  }
};
3781 
3782 #include <thrust/iterator/discard_iterator.h>
/* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic() */
/* Insert/add the COO values v[] into the device CSR value array, using the
   permutation (cooPerm) and optional compaction map (cooPerm_a) built at
   preallocation time.

   A     - the matrix, preallocated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic()
   v     - the values, in the order the (i,j) pairs were given at preallocation;
           may be a host or device pointer. NULL zeroes the matrix under
           INSERT_VALUES, and otherwise only finalizes assembly.
   imode - INSERT_VALUES or ADD_VALUES
*/
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY                           *cooPerm_v = NULL; /* temporary device staging of v[] when v is a host pointer */
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                             *matrix;
  PetscInt                              n;

  PetscFunctionBegin;
  PetscCheck(cusp,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) { /* no COO preallocation information: just finalize assembly */
    PetscCall(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY));
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) {
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else { /* copy host values to a device scratch array */
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v,v+n);
    d_v = cooPerm_v->data();
    PetscCall(PetscLogCpuToGpu(n*sizeof(PetscScalar)));
  }
  PetscCall(PetscLogGpuTimeBegin());
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) { /* there are repeated entries in d_v[]; sum duplicates first, then add */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
        cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonozeros in d_v[].
        cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
      */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* all nonzeros in d_v[] are unique entries */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]]  */
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else {
      /* unique entries: values[i] = d_v[cooPerm[i]] */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    }
  }
  PetscCall(PetscLogGpuTimeEnd());
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU; /* the device copy is now the authoritative one */
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",A->rmap->n,A->cmap->n,a->nz));
  PetscCall(PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",a->rmax));
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
3864 
/* Mark the cached transpose of A as out of date; with destroy == PETSC_TRUE the
   cached transpose structure and csr2csc index map are freed immediately. */
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = static_cast<Mat_SeqAIJCUSPARSE*>(A->spptr);

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp) {
    if (destroy) {
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format));
      delete cusp->csr2csc_i;
      cusp->csr2csc_i = NULL;
    }
    A->transupdated = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3880 
3881 #include <thrust/binary_search.h>
/* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */
/* Build the CSR structure of A (on host and device) from n COO (i,j) pairs.

   The sorted permutation of the input pairs is kept in cusp->cooPerm, and, when
   the input contains duplicates, cusp->cooPerm_a maps each input entry to its
   unique nonzero; both are consumed later by MatSetValuesCOO_SeqAIJCUSPARSE_Basic().
*/
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0; /* nzr counts rows with at least one nonzero */

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(A->rmap));
  PetscCall(PetscLayoutSetUp(A->cmap));
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) { /* size changed: discard old permutation arrays */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);            /* device copies of the COO indices */
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);    /* per-row offsets, computed on device */

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    PetscCall(PetscLogCpuToGpu(2.*n*sizeof(PetscInt)));
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);

    /* Ex.
      n = 6
      coo_i = [3,3,1,4,1,4]
      coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    PetscCall(PetscLogGpuTimeBegin());
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    *cusp->cooPerm_a = d_i; /* copy the sorted array */
    THRUSTINTARRAY w = d_j;

    /*
      d_i     = [1,1,3,3,4,4]
      d_j     = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i     = [1,3,3,4,4,x]
                            ^ekey
      d_j     = [2,2,3,5,6,x]
                           ^nekye
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());                                              /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
      (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a =          [0,0,1,1,1,1]*/
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin()); /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    PetscCall(PetscLogGpuTimeEnd());

    /* replace the host CSR arrays of the SeqAIJ base class with the newly computed pattern */
    PetscCall(MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i));
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(A->rmap->n+1,&a->i));
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    PetscCallCUDA(cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    PetscCall(PetscMalloc1(a->nz,&a->a));
    PetscCall(PetscMalloc1(a->nz,&a->j));
    PetscCallCUDA(cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n,&a->ilen));
    if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n,&a->imax));
    /* fill in per-row lengths and row-related statistics */
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    PetscCall(PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt)));
    PetscCall(MatMarkDiagonal_SeqAIJ(A));
  } else {
    PetscCall(MatSeqAIJSetPreallocation(A,0,NULL));
  }
  PetscCall(MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE));

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  PetscCall(PetscArrayzero(a->a,a->nz));
  PetscCall(MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6));
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
  PetscFunctionReturn(0);
}
3994 
/* Preallocate the matrix from coo_n (i,j) pairs.

   Dispatches to the 'basic' device path when the indices are on the device or
   contain no negative entries; otherwise it falls back to the host SeqAIJ COO
   machinery and mirrors its jmap/perm arrays on the device ("extended" COO).
*/
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_SeqAIJ         *seq;
  Mat_SeqAIJCUSPARSE *dev;
  PetscBool          coo_basic = PETSC_TRUE;
  PetscMemType       mtype = PETSC_MEMTYPE_DEVICE;

  PetscFunctionBegin;
  /* drop any COO state from a previous preallocation before building a new one */
  PetscCall(MatResetPreallocationCOO_SeqAIJ(mat));
  PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat));
  if (coo_i) {
    PetscCall(PetscGetMemType(coo_i,&mtype));
    if (PetscMemTypeHost(mtype)) { /* only host indices are scanned; device indices are assumed non-negative */
      for (PetscCount k=0; k<coo_n; k++) {
        if (coo_i[k] < 0 || coo_j[k] < 0) {coo_basic = PETSC_FALSE; break;}
      }
    }
  }

  if (coo_basic) { /* i,j are on device or do not contain negative indices */
    PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j));
  } else {
    PetscCall(MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));
    seq  = static_cast<Mat_SeqAIJ*>(mat->data);
    dev  = static_cast<Mat_SeqAIJCUSPARSE*>(mat->spptr);
    /* mirror the host-built jmap (nz+1 offsets) and perm (Atot entries) on the device */
    PetscCallCUDA(cudaMalloc((void**)&dev->jmap_d,(seq->nz+1)*sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->jmap_d,seq->jmap,(seq->nz+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMalloc((void**)&dev->perm_d,seq->Atot*sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->perm_d,seq->perm,seq->Atot*sizeof(PetscCount),cudaMemcpyHostToDevice));
    dev->use_extended_coo = PETSC_TRUE;
  }
  PetscFunctionReturn(0);
}
4030 
/* Kernel: fold COO values into the CSR value array.

   Launched with a 1D grid; the grid-stride loop makes it correct for any launch
   configuration. Thread responsible for nonzero i sums kv[perm[k]] for
   k in [jmap[i], jmap[i+1]) -- the COO entries mapping to that nonzero -- and
   either overwrites a[i] (INSERT_VALUES) or accumulates into it.

   The thread index and stride are computed in PetscCount (64-bit) because the
   32-bit products blockIdx.x*blockDim.x and gridDim.x*blockDim.x can overflow
   on very large launches. */
__global__ void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount jmap[],const PetscCount perm[],InsertMode imode,PetscScalar a[])
{
  PetscCount        i = (PetscCount)blockIdx.x*blockDim.x + threadIdx.x;
  const PetscCount  grid_size = (PetscCount)gridDim.x * blockDim.x;
  for (; i<nnz; i+= grid_size) {
    PetscScalar sum = 0.0;
    for (PetscCount k=jmap[i]; k<jmap[i+1]; k++) sum += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES? 0.0 : a[i]) + sum;
  }
}
4041 
/* Set the matrix values from COO input v[], dispatching between the extended
   path (device jmap/perm + kernel) and the basic thrust-based path.

   A     - the matrix, preallocated via MatSetPreallocationCOO_SeqAIJCUSPARSE()
   v     - the coo_n values, host or device pointer
           NOTE(review): the extended path does not handle v == NULL, unlike the
           basic path -- confirm callers never pass NULL here
   imode - INSERT_VALUES or ADD_VALUES
*/
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ          *seq = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE  *dev = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscCount          Annz = seq->nz;
  PetscMemType        memtype;
  const PetscScalar   *v1 = v;
  PetscScalar         *Aa;

  PetscFunctionBegin;
  if (dev->use_extended_coo) {
    PetscCall(PetscGetMemType(v,&memtype));
    if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
      PetscCallCUDA(cudaMalloc((void**)&v1,seq->coo_n*sizeof(PetscScalar)));
      PetscCallCUDA(cudaMemcpy((void*)v1,v,seq->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice));
    }

    /* INSERT_VALUES overwrites everything, so write-only access suffices */
    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa));
    else PetscCall(MatSeqAIJCUSPARSEGetArray(A,&Aa));

    if (Annz) {
      /* one thread per nonzero, 256 threads per block */
      MatAddCOOValues<<<(Annz+255)/256,256>>>(v1,Annz,dev->jmap_d,dev->perm_d,imode,Aa);
      PetscCallCUDA(cudaPeekAtLastError()); /* catch launch-configuration errors without clearing the error state */
    }

    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa));
    else PetscCall(MatSeqAIJCUSPARSERestoreArray(A,&Aa));

    if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void*)v1)); /* free the staging copy */
  } else {
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A,v,imode));
  }
  PetscFunctionReturn(0);
}
4076 
4077 /*@C
4078     MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices.
4079 
4080    Not collective
4081 
4082     Input Parameters:
4083 +   A - the matrix
4084 -   compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form
4085 
4086     Output Parameters:
+   i - the CSR row pointers
-   j - the CSR column indices
4089 
4090     Level: developer
4091 
4092     Notes:
4093       When compressed is true, the CSR structure does not contain empty rows
4094 
4095 .seealso: MatSeqAIJCUSPARSERestoreIJ(), MatSeqAIJCUSPARSEGetArrayRead()
4096 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  /* NOTE(review): both i and j must be requested; asking for only one silently
     returns nothing, which makes the per-pointer checks below always true --
     confirm this early return is intended */
  if (!i || !j) PetscFunctionReturn(0);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckFalse(cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* make sure device CSR data is current */
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      if (!cusp->rowoffsets_gpu) {
        /* lazily upload the full (uncompressed) row offsets from the host */
        cusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(0);
}
4124 
4125 /*@C
4126     MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ()
4127 
4128    Not collective
4129 
4130     Input Parameters:
4131 +   A - the matrix
4132 -   compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form
4133 
4134     Output Parameters:
+   i - the CSR row pointers
-   j - the CSR column indices
4137 
4138     Level: developer
4139 
4140 .seealso: MatSeqAIJCUSPARSEGetIJ()
4141 @*/
/* Invalidate the caller's i/j pointers handed out by MatSeqAIJCUSPARSEGetIJ();
   no device work is needed since the Get did not transfer ownership */
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (i) *i = NULL;
  if (j) *j = NULL;
  PetscFunctionReturn(0);
}
4151 
4152 /*@C
4153    MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4154 
4155    Not Collective
4156 
4157    Input Parameter:
4158 .   A - a MATSEQAIJCUSPARSE matrix
4159 
4160    Output Parameter:
4161 .   a - pointer to the device data
4162 
4163    Level: developer
4164 
4165    Notes: may trigger host-device copies if up-to-date matrix data is on host
4166 
4167 .seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArrayRead()
4168 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckFalse(cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* ensure the device values are up to date before exposing them */
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get(); /* raw device pointer into the CSR value array */
  PetscFunctionReturn(0);
}
4186 
4187 /*@C
4188    MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead()
4189 
4190    Not Collective
4191 
4192    Input Parameter:
4193 .   A - a MATSEQAIJCUSPARSE matrix
4194 
4195    Output Parameter:
4196 .   a - pointer to the device data
4197 
4198    Level: developer
4199 
4200 .seealso: MatSeqAIJCUSPARSEGetArrayRead()
4201 @*/
4202 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
4203 {
4204   PetscFunctionBegin;
4205   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
4206   PetscValidPointer(a,2);
4207   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
4208   *a = NULL;
4209   PetscFunctionReturn(0);
4210 }
4211 
/*@C
   MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes: may trigger host-device copies if up-to-date matrix data is on host

.seealso: MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArray()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* only the CSR storage format exposes a flat values array */
  PetscCheckFalse(cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* read-write access: start from up-to-date device data */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusparsestruct->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix*)cusparsestruct->mat->mat;
  PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = matrix->values->data().get();
  /* the caller may modify values: the GPU copy becomes the authoritative one
     and any cached transpose values are stale */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
  PetscFunctionReturn(0);
}
/*@C
   MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray()

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

.seealso: MatSeqAIJCUSPARSEGetArray()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* values may have changed through the returned pointer: drop the cached
     diagonal and advance the object state so dependents recompute */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(0);
}
4274 
/*@C
   MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes: does not trigger host-device copies and flags data validity on the GPU

.seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSERestoreArrayWrite()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* only the CSR storage format exposes a flat values array */
  PetscCheckFalse(cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* write-only access: existing values will be overwritten, so unlike
     MatSeqAIJCUSPARSEGetArray() we deliberately skip the host-to-device copy */
  PetscCheck(cusparsestruct->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix*)cusparsestruct->mat->mat;
  PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = matrix->values->data().get();
  /* after the caller writes, the GPU copy is authoritative and any cached
     transpose values are stale */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
  PetscFunctionReturn(0);
}
4310 
/*@C
   MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite()

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

.seealso: MatSeqAIJCUSPARSEGetArrayWrite()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* values were (re)written on the device: drop the cached diagonal and
     advance the object state so dependents recompute */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(0);
}
4337 
/* Strict-weak ordering on (row,col,value,perm) tuples: lexicographic on the
   first two components (row-major COO order); value and perm tag are ignored. */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    if (t1.get<0>() != t2.get<0>()) return t1.get<0>() < t2.get<0>();
    return t1.get<1>() < t2.get<1>();
  }
};
4348 
/* Unary functor adding a fixed offset to an integer index (used to shift
   B's column indices past A's columns when the two are concatenated). */
struct Shift
{
  int _shift; /* constant offset added to every input */

  Shift(int shift) : _shift(shift) {}
  __host__ __device__
  inline int operator() (const int &c) { return _shift + c; }
};
4360 
/* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in matlab notation */
/* With MAT_INITIAL_MATRIX the sparsity pattern of C = [A B] is built on the device
   (COO merge of the two CSR matrices) and a permutation (cooPerm) is recorded so that
   a later MAT_REUSE_MATRIX call only needs to scatter new values into place.
   MAT_INPLACE_MATRIX is not supported. */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     Annz,Bnnz;
  cusparseStatus_t             stat;
  PetscInt                     i,m,n,zero = 0;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidHeaderSpecific(B,MAT_CLASSID,2);
  PetscValidPointer(C,4);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
  /* NOTE(review): message typo "number or rows" (should read "number of rows") kept as-is */
  PetscCheck(A->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
  PetscCheck(reuse != MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
  PetscCheckFalse(Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCheckFalse(Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    /* create C and hand-assemble its cusparse structures (no MatSetValues path) */
    m     = A->rmap->n;
    n     = A->cmap->n + B->cmap->n;
    PetscCall(MatCreate(PETSC_COMM_SELF,C));
    PetscCall(MatSetSizes(*C,m,n,m,n));
    PetscCall(MatSetType(*C,MATSEQAIJCUSPARSE));
    c     = (Mat_SeqAIJ*)(*C)->data;
    Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
    Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr  = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE; /* C stores full rows, never compressed-row */
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows    = m;
    Ccusp->mat      = Cmat;
    Ccusp->mat->mat = Ccsr;
    Ccsr->num_rows  = m;
    Ccsr->num_cols  = n;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
    PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
    PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
    /* device-resident scalar constants used by cusparse SpMV/SpMM calls */
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
    PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
    PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr = (CsrMatrix*)Acusp->mat->mat;
    Bcsr = (CsrMatrix*)Bcusp->mat->mat;
    Annz = (PetscInt)Acsr->column_indices->size();
    Bnnz = (PetscInt)Bcsr->column_indices->size();
    c->nz = Annz + Bnnz;
    Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    Ccsr->num_entries = c->nz;
    /* cooPerm records, for each source entry (A entries first, then B), its slot in C;
       the MAT_REUSE_MATRIX branch below replays it to scatter fresh values */
    Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      auto Acoo = new THRUSTINTARRAY32(Annz);
      auto Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff,*Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt)));
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      PetscCall(PetscLogGpuTimeBegin());
      /* expand CSR row offsets of A and B to explicit COO row indices */
      stat = cusparseXcsr2coo(Acusp->handle,
                              Aroff->data().get(),
                              Annz,
                              m,
                              Acoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle,
                              Broff->data().get(),
                              Bnnz,
                              m,
                              Bcoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      /* tag each merged entry with 1 (from A) or 0 (from B) so the origin can be
         recovered after the merge */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
      /* shift B's column indices on the fly so they land after A's columns */
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
#endif
      auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
      auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
      auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
      auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
      auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
      auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
      auto p1 = Ccusp->cooPerm->begin();
      auto p2 = Ccusp->cooPerm->begin();
      thrust::advance(p2,Annz);
      /* stable merge of the two row-major-sorted COO streams into C */
      PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
      /* undo the in-place shift of B's column indices done above */
      thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
#endif
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      /* split the destination slots by origin tag: A's slots into cooPerm[0..Annz),
         B's slots into cooPerm[Annz..) (manual partition_copy, see #if 0 above) */
      auto pred = thrust::identity<int>();
      PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
      PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
#endif
      /* compress merged COO row indices back to CSR row offsets for C */
      stat = cusparseXcoo2csr(Ccusp->handle,
                              Ccoo->data().get(),
                              c->nz,
                              m,
                              Ccsr->row_offsets->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat);
      PetscCall(PetscLogGpuTimeEnd());
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
                               Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        /* C^T = [A^T; B^T] stacked by rows: A^T's CSR arrays come first, then B^T's
           with row offsets shifted by a->nz */
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix *CcsrT = new CsrMatrix;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated = PETSC_TRUE;
        Ccusp->rowoffsets_gpu = NULL;
        CmatT->cprowIndices = NULL;
        CmatT->mat = CcsrT;
        CcsrT->num_rows = n;
        CcsrT->num_cols = m;
        CcsrT->num_entries = c->nz;

        CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values = new THRUSTARRAY(c->nz);

        PetscCall(PetscLogGpuTimeBegin());
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
          thrust::advance(rT,-1); /* overwrite A^T's trailing offset with B^T's first (both equal a->nz after the shift) */
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
          thrust::copy(titb,tite,rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        PetscCall(PetscLogGpuTimeEnd());

        PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
                                 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
                                 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* mirror the device CSR structure on the host so C is a valid SeqAIJ too */
    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(m+1,&c->i));
    PetscCall(PetscMalloc1(c->nz,&c->j));
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii   = *Ccsr->row_offsets;
      jj   = *Ccsr->column_indices;
      PetscCallCUDA(cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    } else {
      PetscCallCUDA(cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    }
    PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt)));
    PetscCall(PetscMalloc1(m,&c->ilen));
    PetscCall(PetscMalloc1(m,&c->imax));
    c->maxnz = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax = 0;
    /* per-row lengths and nonzero-row statistics from the row offsets */
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i+1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax,nn);
    }
    PetscCall(MatMarkDiagonal_SeqAIJ(*C));
    PetscCall(PetscMalloc1(c->nz,&c->a));
    (*C)->nonzerostate++;
    PetscCall(PetscLayoutSetUp((*C)->rmap));
    PetscCall(PetscLayoutSetUp((*C)->cmap));
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* MAT_REUSE_MATRIX: the structure of C is unchanged, only values are rescattered
       through the permutation recorded in cooPerm */
    PetscCheck((*C)->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,(*C)->rmap->n,B->rmap->n);
    c = (Mat_SeqAIJ*)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
      PetscCheck(Ccusp->cooPerm,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
      PetscCheckFalse(Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
      PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate,PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
      PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix*)Acusp->mat->mat;
      Bcsr = (CsrMatrix*)Bcusp->mat->mat;
      Ccsr = (CsrMatrix*)Ccusp->mat->mat;
      PetscCheckFalse(Acsr->num_entries != (PetscInt)Acsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %" PetscInt_FMT " != %" PetscInt_FMT,Acsr->num_entries,(PetscInt)Acsr->values->size());
      PetscCheckFalse(Bcsr->num_entries != (PetscInt)Bcsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %" PetscInt_FMT " != %" PetscInt_FMT,Bcsr->num_entries,(PetscInt)Bcsr->values->size());
      PetscCheckFalse(Ccsr->num_entries != (PetscInt)Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT,Ccsr->num_entries,(PetscInt)Ccsr->values->size());
      PetscCheckFalse(Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries,PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT,Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
      PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %" PetscInt_FMT " != %" PetscInt_FMT,(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid,Acsr->num_entries);
      PetscCall(PetscLogGpuTimeBegin());
      /* scatter A's values into C via cooPerm[0..Annz) */
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      thrust::for_each(zibait,zieait,VecCUDAEquals());
      /* scatter B's values into C via cooPerm[Annz..) */
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
      thrust::for_each(zibbit,ziebit,VecCUDAEquals());
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE));
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        /* refresh C^T values directly by concatenating A^T and B^T values */
        PetscCheck(Ccusp->matTranspose,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      PetscCall(PetscLogGpuTimeEnd());
    }
  }
  PetscCall(PetscObjectStateIncrease((PetscObject)*C));
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
4662 
/* Copies the (sub)set of matrix values selected by idx[] into v, entirely on the
   device when possible.

   Input Parameters:
.  A   - the MATSEQAIJCUSPARSE matrix
.  n   - number of values to copy
.  idx - indices (into the aij value array) of the values to gather; if NULL the
         first n values are copied contiguously

   Output Parameter:
.  v   - destination array; may be host or device memory (detected via isCudaMem()) */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool              dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A,&av));
  if (n && idx) {
    /* gather path: ship the indices to the device and permute through them */
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt)));

    THRUSTARRAY *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      /* v lives on the host: gather into a device scratch buffer first */
      w = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (w) {
      PetscCallCUDA(cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
    }
    delete w;
  } else {
    /* contiguous path: straight memcpy of the leading n values */
    PetscCallCUDA(cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  /* when v is host memory the result traveled device->host; log it as such
     (was incorrectly logged as CpuToGpu) */
  if (!dmem) PetscCall(PetscLogGpuToCpu(n*sizeof(PetscScalar)));
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A,&av));
  PetscFunctionReturn(0);
}
4700