xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 088d793e4684837a6632086c6b8c2d5d41ebecc0)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
7 
8 #include <petscconf.h>
9 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
10 #include <../src/mat/impls/sbaij/seq/sbaij.h>
11 #include <../src/vec/vec/impls/dvecimpl.h>
12 #include <petsc/private/vecimpl.h>
13 #undef VecType
14 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
15 #include <thrust/adjacent_difference.h>
16 #include <thrust/async/for_each.h>
17 #include <thrust/iterator/constant_iterator.h>
18 #include <thrust/remove.h>
19 #include <thrust/sort.h>
20 #include <thrust/unique.h>
21 
22 const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
23 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
24   /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
25     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
26 
27   typedef enum {
28       CUSPARSE_MV_ALG_DEFAULT = 0,
29       CUSPARSE_COOMV_ALG      = 1,
30       CUSPARSE_CSRMV_ALG1     = 2,
31       CUSPARSE_CSRMV_ALG2     = 3
32   } cusparseSpMVAlg_t;
33 
34   typedef enum {
35       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
36       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
37       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
38       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
39       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
40       CUSPARSE_SPMM_ALG_DEFAULT = 0,
41       CUSPARSE_SPMM_COO_ALG1    = 1,
42       CUSPARSE_SPMM_COO_ALG2    = 2,
43       CUSPARSE_SPMM_COO_ALG3    = 3,
44       CUSPARSE_SPMM_COO_ALG4    = 5,
45       CUSPARSE_SPMM_CSR_ALG1    = 4,
46       CUSPARSE_SPMM_CSR_ALG2    = 6,
47   } cusparseSpMMAlg_t;
48 
49   typedef enum {
50       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
51       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
52   } cusparseCsr2CscAlg_t;
53   */
54   const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
55   const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
56   const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
57 #endif
58 
59 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
60 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
61 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
62 
63 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
64 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
65 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
66 
67 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
68 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
69 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
70 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
71 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
72 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
73 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
74 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
75 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
76 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
77 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
78 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
79 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
80 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
81 
82 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
83 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
84 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
85 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
86 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
87 
88 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
89 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);
90 
91 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
92 static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscCount,const PetscInt[],const PetscInt[]);
93 static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
94 
95 PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
96 {
97   cusparseStatus_t   stat;
98   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
99 
100   PetscFunctionBegin;
101   PetscCheckFalse(!cusparsestruct,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
102   cusparsestruct->stream = stream;
103   stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
104   PetscFunctionReturn(0);
105 }
106 
107 PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
108 {
109   cusparseStatus_t   stat;
110   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
111 
112   PetscFunctionBegin;
113   PetscCheckFalse(!cusparsestruct,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
114   if (cusparsestruct->handle != handle) {
115     if (cusparsestruct->handle) {
116       stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
117     }
118     cusparsestruct->handle = handle;
119   }
120   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
121   PetscFunctionReturn(0);
122 }
123 
124 PetscErrorCode MatCUSPARSEClearHandle(Mat A)
125 {
126   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
127   PetscBool          flg;
128   PetscErrorCode     ierr;
129 
130   PetscFunctionBegin;
131   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
132   if (!flg || !cusparsestruct) PetscFunctionReturn(0);
133   if (cusparsestruct->handle) cusparsestruct->handle = 0;
134   PetscFunctionReturn(0);
135 }
136 
137 PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
138 {
139   PetscFunctionBegin;
140   *type = MATSOLVERCUSPARSE;
141   PetscFunctionReturn(0);
142 }
143 
144 /*MC
145   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
146   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
147   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
148   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
149   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
150   algorithms are not recommended. This class does NOT support direct solver operations.
151 
152   Level: beginner
153 
154 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
155 M*/
156 
157 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
158 {
159   PetscErrorCode ierr;
160   PetscInt       n = A->rmap->n;
161 
162   PetscFunctionBegin;
163   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
164   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
165   (*B)->factortype = ftype;
166   ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
167 
168   if (A->boundtocpu && A->bindingpropagates) { ierr = MatBindToCPU(*B,PETSC_TRUE);CHKERRQ(ierr); }
169   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
170     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
171     if (!A->boundtocpu) {
172       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
173       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
174     } else {
175       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
176       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
177     }
178     ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
179     ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]);CHKERRQ(ierr);
180     ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);CHKERRQ(ierr);
181   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
182     if (!A->boundtocpu) {
183       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
184       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
185     } else {
186       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
187       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
188     }
189     ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);CHKERRQ(ierr);
190     ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]);CHKERRQ(ierr);
191   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
192 
193   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
194   (*B)->canuseordering = PETSC_TRUE;
195   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
196   PetscFunctionReturn(0);
197 }
198 
199 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
200 {
201   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
202 
203   PetscFunctionBegin;
204   switch (op) {
205   case MAT_CUSPARSE_MULT:
206     cusparsestruct->format = format;
207     break;
208   case MAT_CUSPARSE_ALL:
209     cusparsestruct->format = format;
210     break;
211   default:
212     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
213   }
214   PetscFunctionReturn(0);
215 }
216 
217 /*@
218    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
219    operation. Only the MatMult operation can use different GPU storage formats
220    for MPIAIJCUSPARSE matrices.
221    Not Collective
222 
223    Input Parameters:
224 +  A - Matrix of type SEQAIJCUSPARSE
225 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
226 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
227 
228    Output Parameter:
229 
230    Level: intermediate
231 
232 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
233 @*/
234 PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
235 {
236   PetscErrorCode ierr;
237 
238   PetscFunctionBegin;
239   PetscValidHeaderSpecific(A, MAT_CLASSID,1);
240   ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
241   PetscFunctionReturn(0);
242 }
243 
244 PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A,PetscBool use_cpu)
245 {
246   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
247 
248   PetscFunctionBegin;
249   cusparsestruct->use_cpu_solve = use_cpu;
250   PetscFunctionReturn(0);
251 }
252 
253 /*@
254    MatCUSPARSESetUseCPUSolve - Sets use CPU MatSolve.
255 
256    Input Parameters:
257 +  A - Matrix of type SEQAIJCUSPARSE
258 -  use_cpu - set flag for using the built-in CPU MatSolve
259 
260    Output Parameter:
261 
262    Notes:
263    The cuSparse LU solver currently computes the factors with the built-in CPU method
264    and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
265    This method to specify if the solve is done on the CPU or GPU (GPU is the default).
266 
267    Level: intermediate
268 
269 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
270 @*/
271 PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A,PetscBool use_cpu)
272 {
273   PetscErrorCode ierr;
274 
275   PetscFunctionBegin;
276   PetscValidHeaderSpecific(A, MAT_CLASSID,1);
277   ierr = PetscTryMethod(A,"MatCUSPARSESetUseCPUSolve_C",(Mat,PetscBool),(A,use_cpu));CHKERRQ(ierr);
278   PetscFunctionReturn(0);
279 }
280 
281 PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
282 {
283   PetscErrorCode ierr;
284 
285   PetscFunctionBegin;
286   switch (op) {
287     case MAT_FORM_EXPLICIT_TRANSPOSE:
288       /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
289       if (A->form_explicit_transpose && !flg) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);}
290       A->form_explicit_transpose = flg;
291       break;
292     default:
293       ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
294       break;
295   }
296   PetscFunctionReturn(0);
297 }
298 
299 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
300 
301 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
302 {
303   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
304   IS             isrow = b->row,iscol = b->col;
305   PetscBool      row_identity,col_identity;
306   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)B->spptr;
307   PetscErrorCode ierr;
308 
309   PetscFunctionBegin;
310   ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
311   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
312   B->offloadmask = PETSC_OFFLOAD_CPU;
313   /* determine which version of MatSolve needs to be used. */
314   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
315   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
316   if (row_identity && col_identity) {
317     if (!cusparsestruct->use_cpu_solve) {
318       B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
319       B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
320     }
321     B->ops->matsolve = NULL;
322     B->ops->matsolvetranspose = NULL;
323   } else {
324     if (!cusparsestruct->use_cpu_solve) {
325       B->ops->solve = MatSolve_SeqAIJCUSPARSE;
326       B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
327     }
328     B->ops->matsolve = NULL;
329     B->ops->matsolvetranspose = NULL;
330   }
331 
332   /* get the triangular factors */
333   if (!cusparsestruct->use_cpu_solve) {
334     ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
335   }
336   PetscFunctionReturn(0);
337 }
338 
339 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
340 {
341   PetscErrorCode           ierr;
342   MatCUSPARSEStorageFormat format;
343   PetscBool                flg;
344   Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
345 
346   PetscFunctionBegin;
347   ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
348   if (A->factortype == MAT_FACTOR_NONE) {
349     ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
350                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
351     if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}
352 
353     ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
354                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
355     if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
356     ierr = PetscOptionsBool("-mat_cusparse_use_cpu_solve","Use CPU (I)LU solve","MatCUSPARSESetUseCPUSolve",cusparsestruct->use_cpu_solve,&cusparsestruct->use_cpu_solve,&flg);CHKERRQ(ierr);
357     if (flg) {ierr = MatCUSPARSESetUseCPUSolve(A,cusparsestruct->use_cpu_solve);CHKERRQ(ierr);}
358 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
359     ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
360                             "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
361     /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
362 #if PETSC_PKG_CUDA_VERSION_GE(11,2,0)
363     PetscCheckFalse(flg && CUSPARSE_SPMV_CSR_ALG1 != 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
364 #else
365     PetscCheckFalse(flg && CUSPARSE_CSRMV_ALG1 != 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
366 #endif
367     ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
368                             "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
369     PetscCheckFalse(flg && CUSPARSE_SPMM_CSR_ALG1 != 4,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
370 
371     ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
372                             "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
373     PetscCheckFalse(flg && CUSPARSE_CSR2CSC_ALG1 != 1,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
374    #endif
375   }
376   ierr = PetscOptionsTail();CHKERRQ(ierr);
377   PetscFunctionReturn(0);
378 }
379 
380 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
381 {
382   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
383   PetscErrorCode               ierr;
384 
385   PetscFunctionBegin;
386   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
387   ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
388   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
389   PetscFunctionReturn(0);
390 }
391 
392 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
393 {
394   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
395   PetscErrorCode               ierr;
396 
397   PetscFunctionBegin;
398   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
399   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
400   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
401   PetscFunctionReturn(0);
402 }
403 
404 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
405 {
406   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
407   PetscErrorCode               ierr;
408 
409   PetscFunctionBegin;
410   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
411   ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
412   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
413   PetscFunctionReturn(0);
414 }
415 
416 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
417 {
418   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
419   PetscErrorCode               ierr;
420 
421   PetscFunctionBegin;
422   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
423   ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
424   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
425   PetscFunctionReturn(0);
426 }
427 
428 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
429 {
430   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
431   PetscInt                          n = A->rmap->n;
432   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
433   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
434   cusparseStatus_t                  stat;
435   const PetscInt                    *ai = a->i,*aj = a->j,*vi;
436   const MatScalar                   *aa = a->a,*v;
437   PetscInt                          *AiLo, *AjLo;
438   PetscInt                          i,nz, nzLower, offset, rowOffset;
439   PetscErrorCode                    ierr;
440   cudaError_t                       cerr;
441 
442   PetscFunctionBegin;
443   if (!n) PetscFunctionReturn(0);
444   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
445     try {
446       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
447       nzLower=n+ai[n]-ai[1];
448       if (!loTriFactor) {
449         PetscScalar                       *AALo;
450 
451         cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
452 
453         /* Allocate Space for the lower triangular matrix */
454         cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
455         cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
456 
457         /* Fill the lower triangular matrix */
458         AiLo[0]  = (PetscInt) 0;
459         AiLo[n]  = nzLower;
460         AjLo[0]  = (PetscInt) 0;
461         AALo[0]  = (MatScalar) 1.0;
462         v        = aa;
463         vi       = aj;
464         offset   = 1;
465         rowOffset= 1;
466         for (i=1; i<n; i++) {
467           nz = ai[i+1] - ai[i];
468           /* additional 1 for the term on the diagonal */
469           AiLo[i]    = rowOffset;
470           rowOffset += nz+1;
471 
472           ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
473           ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);
474 
475           offset      += nz;
476           AjLo[offset] = (PetscInt) i;
477           AALo[offset] = (MatScalar) 1.0;
478           offset      += 1;
479 
480           v  += nz;
481           vi += nz;
482         }
483 
484         /* allocate space for the triangular factor information */
485         ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
486         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
487         /* Create the matrix description */
488         stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
489         stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
490        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
491         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
492        #else
493         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
494        #endif
495         stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
496         stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
497 
498         /* set the operation */
499         loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
500 
501         /* set the matrix */
502         loTriFactor->csrMat = new CsrMatrix;
503         loTriFactor->csrMat->num_rows = n;
504         loTriFactor->csrMat->num_cols = n;
505         loTriFactor->csrMat->num_entries = nzLower;
506 
507         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
508         loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);
509 
510         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
511         loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);
512 
513         loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
514         loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
515 
516         /* Create the solve analysis information */
517         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
518         stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
519       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
520         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
521                                        loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
522                                        loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
523                                        loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
524                                        &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
525         cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
526       #endif
527 
528         /* perform the solve analysis */
529         stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
530                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
531                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
532                                  loTriFactor->csrMat->column_indices->data().get(),
533                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
534                                  loTriFactor->solveInfo,
535                                  loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
536                                #else
537                                  loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
538                                #endif
539         cerr = WaitForCUDA();CHKERRCUDA(cerr);
540         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
541 
542         /* assign the pointer */
543         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
544         loTriFactor->AA_h = AALo;
545         cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
546         cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
547         ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
548       } else { /* update values only */
549         if (!loTriFactor->AA_h) {
550           cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
551         }
552         /* Fill the lower triangular matrix */
553         loTriFactor->AA_h[0]  = 1.0;
554         v        = aa;
555         vi       = aj;
556         offset   = 1;
557         for (i=1; i<n; i++) {
558           nz = ai[i+1] - ai[i];
559           ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
560           offset      += nz;
561           loTriFactor->AA_h[offset] = 1.0;
562           offset      += 1;
563           v  += nz;
564         }
565         loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
566         ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
567       }
568     } catch(char *ex) {
569       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
570     }
571   }
572   PetscFunctionReturn(0);
573 }
574 
575 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
576 {
577   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
578   PetscInt                          n = A->rmap->n;
579   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
580   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
581   cusparseStatus_t                  stat;
582   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
583   const MatScalar                   *aa = a->a,*v;
584   PetscInt                          *AiUp, *AjUp;
585   PetscInt                          i,nz, nzUpper, offset;
586   PetscErrorCode                    ierr;
587   cudaError_t                       cerr;
588 
589   PetscFunctionBegin;
590   if (!n) PetscFunctionReturn(0);
591   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
592     try {
593       /* next, figure out the number of nonzeros in the upper triangular matrix. */
594       nzUpper = adiag[0]-adiag[n];
595       if (!upTriFactor) {
596         PetscScalar *AAUp;
597 
598         cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
599 
600         /* Allocate Space for the upper triangular matrix */
601         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
602         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
603 
604         /* Fill the upper triangular matrix */
605         AiUp[0]=(PetscInt) 0;
606         AiUp[n]=nzUpper;
607         offset = nzUpper;
608         for (i=n-1; i>=0; i--) {
609           v  = aa + adiag[i+1] + 1;
610           vi = aj + adiag[i+1] + 1;
611 
612           /* number of elements NOT on the diagonal */
613           nz = adiag[i] - adiag[i+1]-1;
614 
615           /* decrement the offset */
616           offset -= (nz+1);
617 
618           /* first, set the diagonal elements */
619           AjUp[offset] = (PetscInt) i;
620           AAUp[offset] = (MatScalar)1./v[nz];
621           AiUp[i]      = AiUp[i+1] - (nz+1);
622 
623           ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
624           ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
625         }
626 
627         /* allocate space for the triangular factor information */
628         ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
629         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
630 
631         /* Create the matrix description */
632         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
633         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
634        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
635         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
636        #else
637         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
638        #endif
639         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
640         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
641 
642         /* set the operation */
643         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
644 
645         /* set the matrix */
646         upTriFactor->csrMat = new CsrMatrix;
647         upTriFactor->csrMat->num_rows = n;
648         upTriFactor->csrMat->num_cols = n;
649         upTriFactor->csrMat->num_entries = nzUpper;
650 
651         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
652         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
653 
654         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
655         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
656 
657         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
658         upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
659 
660         /* Create the solve analysis information */
661         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
662         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
663       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
664         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
665                                      upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
666                                      upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
667                                      upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
668                                      &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
669         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
670       #endif
671 
672         /* perform the solve analysis */
673         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
674                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
675                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
676                                  upTriFactor->csrMat->column_indices->data().get(),
677                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
678                                  upTriFactor->solveInfo,
679                                  upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
680                                #else
681                                  upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
682                                #endif
683         cerr = WaitForCUDA();CHKERRCUDA(cerr);
684         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
685 
686         /* assign the pointer */
687         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
688         upTriFactor->AA_h = AAUp;
689         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
690         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
691         ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
692       } else {
693         if (!upTriFactor->AA_h) {
694           cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
695         }
696         /* Fill the upper triangular matrix */
697         offset = nzUpper;
698         for (i=n-1; i>=0; i--) {
699           v  = aa + adiag[i+1] + 1;
700 
701           /* number of elements NOT on the diagonal */
702           nz = adiag[i] - adiag[i+1]-1;
703 
704           /* decrement the offset */
705           offset -= (nz+1);
706 
707           /* first, set the diagonal elements */
708           upTriFactor->AA_h[offset] = 1./v[nz];
709           ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
710         }
711         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
712         ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
713       }
714     } catch(char *ex) {
715       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
716     }
717   }
718   PetscFunctionReturn(0);
719 }
720 
/*
   Ensures both ILU triangular factors of A are present on the GPU and that the
   auxiliary solve data exists: a device work vector of length n and, when the
   row/column orderings are not the identity, device copies of the permutation
   index arrays. Idempotent: already-created pieces are left untouched.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           rowis    = aij->row;
  IS                           colis    = aij->icol;
  PetscInt                     n        = A->rmap->n;
  PetscBool                    rowIdentity,colIdentity;

  PetscFunctionBegin;
  PetscCheckFalse(!factors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  if (!factors->workVector) factors->workVector = new THRUSTARRAY(n);
  factors->nnz = aij->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* device copy of the row permutation, only needed for non-identity orderings */
  ierr = ISIdentity(rowis,&rowIdentity);CHKERRQ(ierr);
  if (!rowIdentity && !factors->rpermIndices) {
    const PetscInt *ridx;

    ierr = ISGetIndices(rowis,&ridx);CHKERRQ(ierr);
    factors->rpermIndices = new THRUSTINTARRAY(n);
    factors->rpermIndices->assign(ridx,ridx+n);
    ierr = ISRestoreIndices(rowis,&ridx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* device copy of the column permutation, only needed for non-identity orderings */
  ierr = ISIdentity(colis,&colIdentity);CHKERRQ(ierr);
  if (!colIdentity && !factors->cpermIndices) {
    const PetscInt *cidx;

    ierr = ISGetIndices(colis,&cidx);CHKERRQ(ierr);
    factors->cpermIndices = new THRUSTINTARRAY(n);
    factors->cpermIndices->assign(cidx,cidx+n);
    ierr = ISRestoreIndices(colis,&cidx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
764 
/*
   Builds (or numerically refreshes) the GPU data structures for the ICC(0)/Cholesky
   triangular solves of A. A single upper-triangular CSR pattern is built on the host
   in pinned memory and uploaded twice: once as the upper factor (solved NON_TRANSPOSE,
   unit diagonal) and once, with rescaled values, as the lower factor (same CSR data
   solved TRANSPOSE, non-unit diagonal). On the first call the full structures plus
   cuSPARSE analysis info are created; on later calls (factors already exist) only the
   numerical values are recomputed and re-uploaded.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;    /* pinned host CSR row offsets / column indices of the upper factor */
  PetscScalar                       *AAUp;           /* pinned host values of the upper factor */
  PetscScalar                       *AALo;           /* pinned host values of the (transposed) lower factor */
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is also viewed through the SBAIJ layout here; this assumes the
     factor's i/j/a arrays are laid out identically in Mat_SeqAIJ and Mat_SeqSBAIJ -- confirm */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers for the two value arrays (enables fast H2D upload) */
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements; v[nz] is the stored diagonal of row i
             (the last stored value of the row) */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            /* off-diagonals: negate for the upper factor, and additionally scale by the
               inverse diagonal for the lower factor */
            ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        /* unit diagonal: the stored 1/d diagonal entries are ignored by the upper solve */
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(),
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 upTriFactor->solveInfo,
                                 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                                #else
                                  upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
                                #endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        /* UPPER fill mode is intentional: the lower factor is realized as the transpose
           of this upper-stored CSR (see the TRANSPOSE solve operation below) */
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(),
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 loTriFactor->solveInfo,
                                 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                                #else
                                 loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
                                #endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      } else {
        /* factors already exist: the sparsity pattern is unchanged, so only recompute
           and re-upload the numerical values */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        PetscCheckFalse(!upTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        PetscCheckFalse(!loTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
984 
/*
   Builds (or refreshes) the GPU ICC triangular factor data for A and creates the
   auxiliary solve data: a device work vector of length n and, for non-identity
   orderings, device copies of the permutation (and inverse permutation) indices.

   Fix: guard the permutation uploads with !rpermIndices/!cpermIndices, matching
   MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU. Without the guard, every numeric
   refactorization allocated fresh THRUSTINTARRAYs and overwrote the pointers,
   leaking the previous device arrays. The permutation is fixed at symbolic
   factorization, so skipping the re-upload is safe.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheckFalse(!cusparseTriFactors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  /* both triangles are stored, sharing the diagonal */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices; allocate only once to avoid leaking on refactorization */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity && !cusparseTriFactors->rpermIndices && !cusparseTriFactors->cpermIndices) {
    IS             iip;
    const PetscInt *irip,*rip;

    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1022 
/*
   Numeric Cholesky/ICC factorization for SEQAIJCUSPARSE: the factorization itself
   runs on the CPU (via MatCholeskyFactorNumeric_SeqAIJ); afterwards the appropriate
   GPU triangular-solve kernels are selected and the factors are pushed to the GPU.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *fact    = (Mat_SeqAIJ*)B->data;
  IS             rowperm  = fact->row;
  PetscBool      identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* the host factorization needs an up-to-date host copy of A */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* natural ordering admits the cheaper solve kernels that skip the permutation step */
  ierr = ISIdentity(rowperm,&identity);CHKERRQ(ierr);
  B->ops->solve             = identity ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering : MatSolve_SeqAIJCUSPARSE;
  B->ops->solvetranspose    = identity ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering : MatSolveTranspose_SeqAIJCUSPARSE;
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1052 
1053 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
1054 {
1055   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1056   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1057   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1058   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
1059   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1060   cusparseStatus_t                  stat;
1061   cusparseIndexBase_t               indexBase;
1062   cusparseMatrixType_t              matrixType;
1063   cusparseFillMode_t                fillMode;
1064   cusparseDiagType_t                diagType;
1065   cudaError_t                       cerr;
1066   PetscErrorCode                    ierr;
1067 
1068   PetscFunctionBegin;
1069   /* allocate space for the transpose of the lower triangular factor */
1070   ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
1071   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1072 
1073   /* set the matrix descriptors of the lower triangular factor */
1074   matrixType = cusparseGetMatType(loTriFactor->descr);
1075   indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
1076   fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1077     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1078   diagType = cusparseGetMatDiagType(loTriFactor->descr);
1079 
1080   /* Create the matrix description */
1081   stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
1082   stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
1083   stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
1084   stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
1085   stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
1086 
1087   /* set the operation */
1088   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1089 
1090   /* allocate GPU space for the CSC of the lower triangular factor*/
1091   loTriFactorT->csrMat = new CsrMatrix;
1092   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
1093   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
1094   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
1095   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
1096   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1097   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
1098 
1099   /* compute the transpose of the lower triangular factor, i.e. the CSC */
1100 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1101   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1102                                        loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1103                                        loTriFactor->csrMat->values->data().get(),
1104                                        loTriFactor->csrMat->row_offsets->data().get(),
1105                                        loTriFactor->csrMat->column_indices->data().get(),
1106                                        loTriFactorT->csrMat->values->data().get(),
1107                                        loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1108                                        CUSPARSE_ACTION_NUMERIC,indexBase,
1109                                        CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1110   cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1111 #endif
1112 
1113   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1114   stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1115                           loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1116                           loTriFactor->csrMat->values->data().get(),
1117                           loTriFactor->csrMat->row_offsets->data().get(),
1118                           loTriFactor->csrMat->column_indices->data().get(),
1119                           loTriFactorT->csrMat->values->data().get(),
1120                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1121                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1122                           CUSPARSE_ACTION_NUMERIC, indexBase,
1123                           CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat);
1124                         #else
1125                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1126                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1127                         #endif
1128   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1129   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1130 
1131   /* Create the solve analysis information */
1132   ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1133   stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1134 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1135   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
1136                                 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1137                                 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1138                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
1139                                 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1140   cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1141 #endif
1142 
1143   /* perform the solve analysis */
1144   stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
1145                            loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1146                            loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1147                            loTriFactorT->csrMat->column_indices->data().get(),
1148                           #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1149                            loTriFactorT->solveInfo,
1150                            loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
1151                           #else
1152                            loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1153                           #endif
1154   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1155   ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1156 
1157   /* assign the pointer */
1158   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1159 
1160   /*********************************************/
1161   /* Now the Transpose of the Upper Tri Factor */
1162   /*********************************************/
1163 
1164   /* allocate space for the transpose of the upper triangular factor */
1165   ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
1166   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1167 
1168   /* set the matrix descriptors of the upper triangular factor */
1169   matrixType = cusparseGetMatType(upTriFactor->descr);
1170   indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
1171   fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1172     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1173   diagType = cusparseGetMatDiagType(upTriFactor->descr);
1174 
1175   /* Create the matrix description */
1176   stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
1177   stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
1178   stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
1179   stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
1180   stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
1181 
1182   /* set the operation */
1183   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1184 
1185   /* allocate GPU space for the CSC of the upper triangular factor*/
1186   upTriFactorT->csrMat = new CsrMatrix;
1187   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1188   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1189   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1190   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
1191   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1192   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1193 
1194   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1195 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1196   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
1197                                 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1198                                 upTriFactor->csrMat->values->data().get(),
1199                                 upTriFactor->csrMat->row_offsets->data().get(),
1200                                 upTriFactor->csrMat->column_indices->data().get(),
1201                                 upTriFactorT->csrMat->values->data().get(),
1202                                 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1203                                 CUSPARSE_ACTION_NUMERIC,indexBase,
1204                                 CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1205   cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1206 #endif
1207 
1208   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1209   stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
1210                           upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1211                           upTriFactor->csrMat->values->data().get(),
1212                           upTriFactor->csrMat->row_offsets->data().get(),
1213                           upTriFactor->csrMat->column_indices->data().get(),
1214                           upTriFactorT->csrMat->values->data().get(),
1215                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1216                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1217                           CUSPARSE_ACTION_NUMERIC, indexBase,
1218                           CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat);
1219                         #else
1220                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1221                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1222                         #endif
1223 
1224   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1225   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1226 
1227   /* Create the solve analysis information */
1228   ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1229   stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1230   #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1231   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
1232                                  upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1233                                  upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1234                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
1235                                  &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1236   cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1237   #endif
1238 
1239   /* perform the solve analysis */
1240   stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
1241                            upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1242                            upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1243                            upTriFactorT->csrMat->column_indices->data().get(),
1244                           #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1245                            upTriFactorT->solveInfo,
1246                            upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
1247                           #else
1248                            upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1249                           #endif
1250 
1251   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1252   ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1253 
1254   /* assign the pointer */
1255   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1256   PetscFunctionReturn(0);
1257 }
1258 
/* Unary functor converting a PetscScalar back to a PetscInt by truncating
   its real part.  Usable on both host and device (e.g. from
   thrust::transform) to decode an index array that was stored in a
   scalar-valued device array. */
struct PetscScalarToPetscInt
{
  __host__ __device__ PetscInt operator()(PetscScalar v)
  {
    return static_cast<PetscInt>(PetscRealPart(v));
  }
};
1267 
/*
   Build or refresh the explicit transpose of A on the GPU, stored in
   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose.

   For MAT_CUSPARSE_CSR the transpose is produced with cusparse csr2csc.
   The first call additionally caches csr2csc_i, the permutation mapping A's
   value array onto the transpose's value array, so that later calls only
   need a thrust gather of A's current values instead of a full csr2csc.
   Returns immediately when A->transupdated is already set.
*/
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  PetscCheckFalse(!matstruct,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  PetscCheckFalse(A->transupdated && !matstructT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(0);
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
    /* non-CSR (ELL/HYB) transposes cannot be updated in place; discard and rebuild below */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
  }
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

    /* set alpha and beta (device-side scalars used by SpMV with the transpose) */
    err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* the transpose swaps the row/column dimensions; nnz stays the same */
      CsrMatrix *matrixT = new CsrMatrix;
      matstructT->mat = matrixT;
      matrixT->num_rows = A->cmap->n;
      matrixT->num_cols = A->rmap->n;
      matrixT->num_entries = a->nz;
      matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values = new THRUSTARRAY(a->nz);

      /* upload A's host row offsets; they are the csr2csc input later on */
      if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      #if PETSC_PKG_CUDA_VERSION_GE(11,2,1)
        stat = cusparseCreateCsr(&matstructT->matDescr,
                               matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                               matrixT->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
      #else
        /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
           see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

           I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
           it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
           when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
        */
        if (matrixT->num_entries) {
          stat = cusparseCreateCsr(&matstructT->matDescr,
                                 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                                 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                                 matrixT->values->data().get(),
                                 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I,
                                 indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);

        } else {
          matstructT->matDescr = NULL;
          matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
        }
      #endif
     #endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
   #else
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows = A->rmap->n;
      temp->num_cols = A->cmap->n;
      temp->num_entries = a->nz;
      temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle,
                              matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get());CHKERRCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      /* NOTE(review): for a rectangular A the transpose has A->cmap->n rows, so these
         dimensions (and the A->rmap->n/A->cmap->n order in the csr2hyb call below) look
         swapped — confirm; this legacy pre-CUDA-11 path may only ever see square matrices */
      tempT->num_rows = A->rmap->n;
      tempT->num_cols = A->cmap->n;
      tempT->num_entries = a->nz;
      tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                              temp->num_cols, temp->num_entries,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get(),
                              tempT->values->data().get(),
                              tempT->column_indices->data().get(),
                              tempT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

      /* Last, convert CSC to HYB */
      cusparseHybMat_t hybMat;
      stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
      cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
        CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                              matstructT->descr, tempT->values->data().get(),
                              tempT->row_offsets->data().get(),
                              tempT->column_indices->data().get(),
                              hybMat, 0, partition);CHKERRCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY*) tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
        delete (CsrMatrix*) tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY*) temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
        delete (CsrMatrix*) temp;
      }
     #endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
    PetscCheckFalse(!matrix,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix");
    PetscCheckFalse(!matrix->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows");
    PetscCheckFalse(!matrix->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols");
    PetscCheckFalse(!matrix->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values");
    PetscCheckFalse(!matrixT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT");
    PetscCheckFalse(!matrixT->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows");
    PetscCheckFalse(!matrixT->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols");
    PetscCheckFalse(!matrixT->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
      ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    if (!cusparsestruct->csr2csc_i) {
      /* Run csr2csc once on a "values" array holding 0,1,2,... (encoded as scalars):
         the transposed values then contain, for each transpose entry, the index of the
         source entry in A, i.e. exactly the gather permutation we want to cache. */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* csr2cscEx2 needs an explicit, caller-allocated work buffer */
      void   *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                           A->cmap->n, matrix->num_entries,
                                           matrix->values->data().get(),
                                           cusparsestruct->rowoffsets_gpu->data().get(),
                                           matrix->column_indices->data().get(),
                                           matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                           CUSPARSE_ACTION_NUMERIC,indexBase,
                                           cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat);
      err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err);
     #endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                              A->cmap->n,matrix->num_entries,
                              csr2csc_a.data().get(),
                              cusparsestruct->rowoffsets_gpu->data().get(),
                              matrix->column_indices->data().get(),
                              matrixT->values->data().get(),
                             #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                              matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                              CUSPARSE_ACTION_NUMERIC,indexBase,
                              cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat);
                             #else
                              matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
                             #endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
      }

      /* decode the scalar-encoded source indices into the cached integer permutation */
      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      err = cudaFree(csr2cscBuffer);CHKERRCUDA(err);
     #endif
    }
    /* gather A's current values into the transpose through the cached permutation */
    PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
                                                     thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
                                                     matrixT->values->begin()));
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
1497 
1498 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
   Triangular solve with the factored matrix applied transposed, for a
   factorization that used row/column permutations: permute b by the row
   permutation into x, solve with the transposed upper factor then the
   transposed lower factor, and finally apply the column permutation through
   the work vector (the permuted copy cannot be done in place).  The
   transposed factors are generated lazily on first use.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U */
  /* (argument lists differ by CUDA version: csrsv2 (>=9.0) additionally takes nnz, a solve policy and a work buffer) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                         xarray);CHKERRCUSPARSE(stat);
                      #endif

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves: 2*nnz - n flops */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1586 
/*
   Triangular solve with the factored matrix applied transposed, for a
   factorization in the natural ordering: no row/column permutations are
   needed, so b feeds the transposed upper-factor solve directly (via the
   work vector) and the transposed lower-factor solve writes straight into x.
   The transposed factors are generated lazily on first use.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U */
  /* (argument lists differ by CUDA version: csrsv2 (>=9.0) additionally takes nnz, a solve policy and a work buffer) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        xarray);CHKERRCUSPARSE(stat);
                      #endif

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves: 2*nnz - n flops */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1656 
/*
   Forward/backward triangular solve with the cached LU factors, for a
   factorization that used row/column permutations: permute b by the row
   permutation into the work vector, solve with L then U, and apply the
   column permutation while copying the result into x.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L */
  /* (argument lists differ by CUDA version: csrsv2 (>=9.0) additionally takes nnz, a solve policy and a work buffer) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                         xarray,
                         loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                         xarray);CHKERRCUSPARSE(stat);
                      #endif

  /* Then, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,xarray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves: 2*nnz - n flops */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1732 
/*
   MatSolve_SeqAIJCUSPARSE_NaturalOrdering - triangular solve with the cached
   GPU factors when the factorization was done in natural ordering, so no
   row/column permutation of b or x is required (contrast with the permuted
   variant above, which copies through rpermIndices/cpermIndices).

   Solves L (into the work vector) and then U (into x) with two cusparse_solve
   calls; all data stays on the device.

   Input Parameters:
+  A  - the factored matrix (factors stashed in A->spptr)
-  bb - right-hand-side vector

   Output Parameter:
.  xx - solution vector
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L: tempGPU <- L^{-1} b.  The extra num_entries/solvePolicy/solveBuffer
     arguments exist only in the CUDA >= 9 csrsv2 interface. */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        loTriFactor->solvePolicy,loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Next, solve U: xarray <- U^{-1} tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        xarray);CHKERRCUSPARSE(stat);
                      #endif

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves: ~2*nnz flops, minus n since the diagonal is applied once */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1794 
/*
   MatSeqAIJCUSPARSECopyFromGPU - synchronizes the host CSR values with the
   device copy when the device is ahead (offload mask PETSC_OFFLOAD_GPU).
   Only the numerical values are transferred; the sparsity pattern on the
   host is assumed unchanged.  On success the mask becomes PETSC_OFFLOAD_BOTH.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cudaError_t        cerr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) { /* device holds the only up-to-date values */
    CsrMatrix    *csr   = (CsrMatrix*)cusp->mat->mat;
    const size_t bytes  = a->nz*sizeof(PetscScalar);

    ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->a,csr->values->data().get(),bytes,cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuToCpu(bytes);CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    A->offloadmask = PETSC_OFFLOAD_BOTH; /* host and device now agree */
  }
  PetscFunctionReturn(0);
}
1815 
/* Gives read/write access to the host CSR values, syncing them from the GPU first if needed. */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ     *aij;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr   = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); /* make host values current */
  aij    = (Mat_SeqAIJ*)A->data;
  *array = aij->a;
  PetscFunctionReturn(0);
}
1825 
/* Ends read/write host access: the caller may have modified values, so mark the CPU copy as master. */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  array[0]       = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU; /* device copy is now stale */
  PetscFunctionReturn(0);
}
1833 
/* Gives read-only access to the host CSR values, syncing them from the GPU first if needed. */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  Mat_SeqAIJ     *aij;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr   = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); /* make host values current */
  aij    = (Mat_SeqAIJ*)A->data;
  *array = aij->a;
  PetscFunctionReturn(0);
}
1843 
/* Ends read-only host access; values were not modified, so the offload mask is left untouched. */
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  PetscFunctionBegin;
  array[0] = NULL;
  PetscFunctionReturn(0);
}
1850 
/* Gives write-only access to the host CSR values; no device-to-host sync is performed
   since the caller is expected to overwrite the contents. */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  array[0] = aij->a;
  PetscFunctionReturn(0);
}
1857 
/* Ends write-only host access: the host values were (re)written, so mark the CPU copy as master. */
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  array[0]       = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU; /* device copy is now stale */
  PetscFunctionReturn(0);
}
1865 
/*
   MatSeqAIJCUSPARSECopyToGPU - mirrors the host AIJ (CSR) matrix on the GPU.

   Two paths:
   - Fast path: the nonzero pattern is unchanged (same nonzerostate) and the
     format is CSR -> only the numerical values are re-uploaded and the cached
     transpose is invalidated (values only).
   - Slow path: the existing GPU mult structure is destroyed and rebuilt from
     scratch (row offsets, column indices, values, and for pre-CUDA-11 the
     optional ELL/HYB conversion), honoring compressed-row storage if in use.

   Errors if the matrix is bound to the CPU.  On success (when host values were
   present) the offload mask becomes PETSC_OFFLOAD_BOTH.
*/
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE;   /* set to FALSE if host values are absent (structure-only upload) */
  cudaError_t                  err;

  PetscFunctionBegin;
  PetscCheckFalse(A->boundtocpu,PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      PetscCheckFalse(a->nz && !a->a,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      matrix->values->assign(a->a, a->a+a->nz); /* host-to-device value upload via thrust */
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); /* cached transpose values are now stale */
    } else {
      PetscInt nnz;
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* nonzero pattern changed (or non-CSR format): rebuild everything */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        if (a->compressedrow.use) { /* only rows with nonzeros are stored */
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheckFalse(!ii,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
        PetscCheckFalse(m && !a->j,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; } /* no host values: upload structure only */
        else nnz = a->nz;

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident scalar constants (1, 0, 1), needed since pointer mode is DEVICE below */
        err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                    mat->num_rows, mat->num_cols, mat->num_entries,
                                    mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                    mat->values->data().get(),
                                    CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                    CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
          }
         #endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
         #else
          /* stage a temporary CSR matrix on the device, convert it to HYB/ELL, then free the CSR copy */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
              matstruct->descr, mat->values->data().get(),
              mat->row_offsets->data().get(),
              mat->column_indices->data().get(),
              hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
         #endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        /* row offsets + column indices (int), optional cprowIndices (PetscInt), values + 3 constants (PetscScalar) */
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
2022 
/* Thrust functor for zipped (src,dst) tuples: dst += src */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
2032 
/* Thrust functor for zipped (src,dst) tuples: dst = src */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    const auto src = thrust::get<0>(t);
    thrust::get<1>(t) = src;
  }
};
2042 
/* Thrust functor for zipped tuples, copying in the opposite direction: slot 0 = slot 1 */
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    const auto src = thrust::get<1>(t);
    thrust::get<0>(t) = src;
  }
};
2052 
/* Per-product state attached to C->product->data for cusparse mat-mat products */
struct MatMatCusparse {
  PetscBool             cisdense;   /* C started as MATSEQDENSE (CPU); convert back after the numeric phase */
  PetscScalar           *Bt;        /* device buffer holding explicit B^T (pre-CUDA-11 csrmm cannot transpose B) */
  Mat                   X;          /* intermediate dense product for RARt/PtAP */
  PetscBool             reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops;      /* flop count for logging; set outside this chunk — TODO confirm */
  CsrMatrix             *Bcsr;      /* presumably a CSR copy of B for spgemm paths; freed in MatDestroy_MatMatCusparse */

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr;
  PetscBool             initialized;   /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr;     /* cached dense descriptor for B */
  cusparseDnMatDescr_t  matCDescr;     /* cached dense descriptor for C (or X) */
  PetscInt              Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  void                  *dBuffer4;
  void                  *dBuffer5;
 #endif
  size_t                mmBufferSize;  /* size of mmBuffer currently allocated */
  void                  *mmBuffer;     /* SpMM/SpGEMM workspace */
  void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};
2077 
/*
   MatDestroy_MatMatCusparse - destructor for the MatMatCusparse product data;
   frees all device buffers, cusparse descriptors (CUDA >= 11 only), the
   intermediate matrix X, and finally the struct itself.
   Note: cudaFree(NULL) is a no-op, so Bt needs no guard.
*/
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  PetscErrorCode   ierr;
  MatMatCusparse   *mmdata = (MatMatCusparse *)data;
  cudaError_t      cerr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t stat;
 #endif

  PetscFunctionBegin;
  cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr);
  delete mmdata->Bcsr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->matBDescr)   { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->matCDescr)   { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->spgemmDesc)  { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); }
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  if (mmdata->dBuffer4)  { cerr = cudaFree(mmdata->dBuffer4);CHKERRCUDA(cerr); }
  if (mmdata->dBuffer5)  { cerr = cudaFree(mmdata->dBuffer5);CHKERRCUDA(cerr); }
 #endif
  if (mmdata->mmBuffer)  { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); }
  if (mmdata->mmBuffer2) { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); }
 #endif
  ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2106 
2107 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
2108 
/*
   MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA - numeric phase of a sparse
   (SeqAIJCUSPARSE) times dense (SeqDENSECUDA) product.

   Handles AB, AtB, ABt, PtAP and RARt.  The sparse-times-dense kernel
   computes op(A)*op(B) via cusparseSpMM (CUDA >= 11) or csrmm (older CUDA,
   which cannot transpose B, so B^T is formed explicitly with cublasXgeam);
   PtAP/RARt then finish with a dense-dense multiply into C.  If the caller
   passed a CPU dense B (or expects a CPU dense C), conversions happen in
   place at entry/exit.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheckFalse(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  PetscCheckFalse(!flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  PetscCheckFalse(A->boundtocpu,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr   = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp   = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* pick op(A) and the result dimensions (m x n) of the sparse-dense kernel */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    /* either multiply with op=transpose, or with a cached explicit transpose of A */
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  PetscCheckFalse(!mat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);

  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* for PtAP/RARt the sparse-dense product goes into the scratch matrix X, not C */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Clda = clda;
    }

    /* lazily create the generic sparse descriptor for A's CSR data */
    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
    /* grow (never shrink) the SpMM workspace */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      cudaError_t cerr;
      cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
      cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
 #else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* form B^T explicitly in mmdata->Bt with a cublas out-of-place transpose */
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k    = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
 #endif
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  /* RARt: C = B * X; PtAP: C = B^T * X (dense-dense multiply on the GPU) */
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  /* restore the original (CPU) types of C and B if they were converted at entry */
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2281 
/*
   MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA - symbolic phase of a sparse
   (SeqAIJCUSPARSE) times dense product.  Sets the sizes and type of C,
   allocates the MatMatCusparse product data (zero-initialized by PetscNew),
   plus, per product type: the explicit-B^T buffer needed by pre-CUDA-11 csrmm,
   and the intermediate dense matrix X for RARt/PtAP.  Installs the numeric
   callback.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheckFalse(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  PetscCheckFalse(!flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscCheckFalse(cusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  /* result dimensions of C for each supported product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense; /* remembered so the numeric phase can convert C back */
 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
 #endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2355 
/*
  Numeric phase of the sparse-sparse product C = op(A)*op(B) on the GPU via cuSPARSE SpGEMM.

  The symbolic phase (MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE) has already
  created C (nonzero pattern, cuSPARSE descriptors and, on CUDA >= 11, the SpGEMM
  descriptor and work buffers), all cached in the MatMatCusparse stored in
  C->product->data.  This routine only (re)computes the numerical values of C on the
  device and then runs a shortened version of MatAssemblyEnd_SeqAIJ (the "finalize"
  label) to update the host-side bookkeeping.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t         BmatSpDescr;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheckFalse(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  PetscCheckFalse(!flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;
  A = product->A;
  B = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    /* only sanity-check the C structures built by the symbolic phase, then jump straight to assembly */
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    PetscCheckFalse(Ccusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    PetscCheckFalse(!Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix*)Cmat->mat;
    PetscCheckFalse(!Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
    goto finalize;
  }
  if (!c->nz) goto finalize; /* empty product: nothing to compute on the device, just (re)assemble */
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  PetscCheckFalse(!flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  PetscCheckFalse(!flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  PetscCheckFalse(A->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCheckFalse(B->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  PetscCheckFalse(Acusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  PetscCheckFalse(Bcusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  PetscCheckFalse(Ccusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);

  ptype = product->type;
  /* the symbolic phase may have exploited symmetry to compute A^t*B (resp. A*B^t) as A*B; mirror that substitution here */
  if (A->symmetric && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    PetscCheckFalse(!product->symbolic_used_the_fact_A_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    PetscCheckFalse(!product->symbolic_used_the_fact_B_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric");
  }
  /* select the CSR structs matching op(A), op(B); transposes were formed explicitly by the symbolic phase */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose; /* explicit transpose built in the symbolic phase */
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose; /* explicit transpose built in the symbolic phase */
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  PetscCheckFalse(!Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  PetscCheckFalse(!Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  PetscCheckFalse(!Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix*)Cmat->mat;
  PetscCheckFalse(!Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  PetscCheckFalse(!Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  PetscCheckFalse(!Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
    /* SpGEMMreuse path: the pattern was computed/saved by the symbolic phase; only the values of C are recomputed */
    stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                               Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                               cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                               mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
  #else
    /* SpGEMM path: recompute the product into the descriptor-internal storage, then copy into C's CSR arrays */
    stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                               Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                               cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                               mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
    stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                               Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                               cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
  #endif
#else
  /* legacy (pre-CUDA 11) csrgemm interface: computes pattern and values in one call directly into C's CSR arrays */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  C->offloadmask = PETSC_OFFLOAD_GPU; /* freshly computed values live on the device only */
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
  ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax);CHKERRQ(ierr);
  c->reallocs         = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}
2485 
/*
  Symbolic phase of the sparse-sparse product C = op(A)*op(B) on the GPU.

  Determines the product sizes, forms explicit transposes of A or B when needed,
  computes the nonzero pattern of C with cuSPARSE (SpGEMMreuse on CUDA >= 11.4,
  SpGEMM on earlier CUDA 11.x, Xcsrgemm on CUDA 10), allocates both the device
  (CsrMatrix) and host (Mat_SeqAIJ i/j/a arrays) representations of C, and caches
  all GPU work data in a MatMatCusparse attached to C->product->data.

  Note: none of the available cuSPARSE interfaces used here can compute the pattern
  without also computing values, so the numerical values of C are computed in every
  code path; when both A and B were already up to date on the GPU and the user-level
  API drives the product, mmdata->reusesym is set so the numeric phase can skip the
  recomputation once.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *a,*b,*c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     i,j,m,n,k;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble               flops;
  PetscBool                    biscompressed,ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
  cusparseSpMatDescr_t         BmatSpDescr;
#else
  int                          cnz;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheckFalse(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  PetscCheckFalse(!flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  PetscCheckFalse(!flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ*)A->data;
  b = (Mat_SeqAIJ*)B->data;
  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  PetscCheckFalse(Acusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  PetscCheckFalse(Bcusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");

  ptype = product->type;
  /* exploit symmetry to replace a transposed operand by the matrix itself; record the substitution so the numeric phase can repeat it */
  if (A->symmetric && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
  }
  if (B->symmetric && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
  }
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  /* determine result dims (m x n), inner dim k, operand mult structs, and compressed-row flags */
  switch (ptype) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    k = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE; /* C inherits A's compressed (nonzero) row set */
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr);
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  ierr  = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  ierr  = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  c     = (Mat_SeqAIJ*)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  Cmat  = new Mat_SeqAIJCUSPARSEMultStruct; /* ownership passes to Ccusp->mat below; freed by the matrix destroy path */
  Ccsr  = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
    c->compressedrow.nrows = a->compressedrow.nrows;
    ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
    ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m; /* device CSR stores only the nonzero rows when compressed */
  Ccusp->mat      = Cmat;
  Ccusp->mat->mat = Ccsr;
  Ccsr->num_rows    = Ccusp->nrows;
  Ccsr->num_cols    = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
  stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
  /* device-resident scalar constants used with CUSPARSE_POINTER_MODE_DEVICE */
  cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
    thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
    c->nz = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  PetscCheckFalse(!Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  PetscCheckFalse(!Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix*)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix */
    /* build a shallow CsrMatrix view of B with uncompressed row offsets; column indices and values are shared with B */
    CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
    Bcsr = new CsrMatrix;
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices;
    Bcsr->values         = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
      ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr = Bcsr; /* stored so the numeric phase uses the same uncompressed view */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
                               Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                               Bcsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  PetscCheckFalse(!Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  PetscCheckFalse(!Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  /* precompute flops count */
  if (ptype == MATPRODUCT_AB) {
    /* each entry a(i,brow) multiplies the whole row brow of B: 2 flops per resulting product term */
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i+1];
      for (j=st; j<en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2.*(b->i[brow+1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    /* row i of A pairs with row i of B: anzi*bnzi product terms, 2 flops each */
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt anzi = a->i[i+1] - a->i[i];
      const PetscInt bnzi = b->i[i+1] - b->i[i];
      flops += (2.*anzi)*bnzi;
    }
  } else { /* TODO */
    flops = 0.;
  }

  mmdata->flops = flops;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  /* C descriptor starts empty (nnz 0, NULL arrays); cuSPARSE fills in the sizes, then we set the pointers below */
  stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
                          NULL, NULL, NULL,
                          CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                          CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
 {
  /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
     We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
  */
  void*  dBuffer1 = NULL;
  void*  dBuffer2 = NULL;
  void*  dBuffer3 = NULL;
  /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
  size_t bufferSize1 = 0;
  size_t bufferSize2 = 0;
  size_t bufferSize3 = 0;
  size_t bufferSize4 = 0;
  size_t bufferSize5 = 0;

  /*----------------------------------------------------------------------*/
  /* ask bufferSize1 bytes for external memory */
  stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                            CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                            &bufferSize1, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &dBuffer1, bufferSize1);CHKERRCUDA(cerr);
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                            CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                            &bufferSize1, dBuffer1);CHKERRCUSPARSE(stat);

  /*----------------------------------------------------------------------*/
  /* two-call idiom again: first query the three buffer sizes, then run the nnz computation with real buffers */
  stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                 &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &dBuffer2, bufferSize2);CHKERRCUDA(cerr);
  cerr = cudaMalloc((void**) &dBuffer3, bufferSize3);CHKERRCUDA(cerr);
  cerr = cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4);CHKERRCUDA(cerr);
  stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                 &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);CHKERRCUSPARSE(stat);
  cerr = cudaFree(dBuffer1);CHKERRCUDA(cerr);
  cerr = cudaFree(dBuffer2);CHKERRCUDA(cerr);

  /*----------------------------------------------------------------------*/
  /* get matrix C non-zero entries C_nnz1 */
  stat  = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
  c->nz = (PetscInt) C_nnz1;
  /* allocate matrix C */
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values         = new THRUSTARRAY(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  /* update matC with the new pointers */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());CHKERRCUSPARSE(stat);

  /*----------------------------------------------------------------------*/
  /* copy the computed pattern into C's arrays; dBuffer5 is kept for later SpGEMMreuse_compute calls */
  stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                  &bufferSize5, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5);CHKERRCUDA(cerr);
  stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                  &bufferSize5, mmdata->dBuffer5);CHKERRCUSPARSE(stat);
  cerr = cudaFree(dBuffer3);CHKERRCUDA(cerr);
  /* compute the values of C here too, so a numeric phase immediately after can be skipped via reusesym */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                                     Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                     cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                     mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
  ierr = PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024);CHKERRQ(ierr);
 }
 #else
  size_t bufSize2;
  /* ask bufferSize bytes for external memory */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
  /* ask bufferSize again bytes for external memory */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
  /* The CUSPARSE documentation is not clear, nor the API
     We need both buffers to perform the operations properly!
     mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
     it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
     is stored in the descriptor! What a messy API... */
  cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
  /* compute the intermediate product of A * B */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  /* get matrix C non-zero entries C_nnz1 */
  stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
  c->nz = (PetscInt) C_nnz1;
  ierr = PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr);
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
 #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
#else
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
  /* legacy interface: first compute the nnz count and row offsets of C on the device */
  stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
  c->nz = cnz;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
     I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
     D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalizesym:
  /* build the host (CPU) SeqAIJ representation of C's pattern from the device CSR data */
  c->singlemalloc = PETSC_FALSE;
  c->free_a       = PETSC_TRUE;
  c->free_ij      = PETSC_TRUE;
  ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
  if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
    PetscInt *d_i = c->i;
    THRUSTINTARRAY ii(Ccsr->row_offsets->size());
    THRUSTINTARRAY jj(Ccsr->column_indices->size());
    ii   = *Ccsr->row_offsets;
    jj   = *Ccsr->column_indices;
    if (ciscompressed) d_i = c->compressedrow.i; /* compressed offsets are copied raw; c->i is expanded below */
    cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  } else {
    PetscInt *d_i = c->i;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  if (ciscompressed) { /* need to expand host row offsets */
    /* rows not listed in rindex are empty: they repeat the previous offset */
    PetscInt r = 0;
    c->i[0] = 0;
    for (k = 0; k < c->compressedrow.nrows; k++) {
      const PetscInt next = c->compressedrow.rindex[k];
      const PetscInt old = c->compressedrow.i[k];
      for (; r < next; r++) c->i[r+1] = old;
    }
    for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
  }
  ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
  c->maxnz = c->nz;
  c->nonzerorowcnt = 0;
  c->rmax = 0;
  /* per-row lengths and row statistics, as MatAssemblyEnd_SeqAIJ would compute them */
  for (k = 0; k < m; k++) {
    const PetscInt nn = c->i[k+1] - c->i[k];
    c->ilen[k] = c->imax[k] = nn;
    c->nonzerorowcnt += (PetscInt)!!nn;
    c->rmax = PetscMax(c->rmax,nn);
  }
  ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); /* host values array allocated but not filled here */
  Ccsr->num_entries = c->nz;

  C->nonzerostate++;
  ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
  Ccusp->nonzerostate = C->nonzerostate;
  C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
  C->preallocated  = PETSC_TRUE;
  C->assembled     = PETSC_FALSE;
  C->was_assembled = PETSC_FALSE;
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
2884 
2885 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2886 
/* handles sparse or dense B.

   Selects the implementation for the matrix product stored in mat->product: the GPU kernels
   when all operands are MATSEQAIJCUSPARSE (or B is MATSEQDENSE) and none is bound to the CPU,
   otherwise the host SeqAIJ implementation. For each product type the user can force the CPU
   backend with a -*_backend_cpu option (option name depends on whether the user entered
   through the old API, product->api_user, or through MatProduct). */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
  PetscErrorCode ierr;
  PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr);
  /* Biscusp stays PETSC_FALSE when either A or B is bound to the CPU, forcing the AIJ fallback */
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr);
  }
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr);
    }
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool usecpu = PETSC_FALSE;
    /* each case only differs in the option name/help text presented to the user */
    switch (product->type) {
    case MATPRODUCT_AB:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_AtB:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_PtAP:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_RARt:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_ABC:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    default:
      break;
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
     if (product->A->boundtocpu) {
        ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      /* composite products are built from the pairwise GPU products */
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3009 
/* yy = A xx (no transpose, no add: the kernel's y slot is NULL) */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3018 
/* zz = A xx + yy */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3027 
/* yy = A^H xx (trans and herm both set) */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3036 
/* zz = A^H xx + yy */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3045 
/* yy = A^T xx (transpose without conjugation) */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3054 
/* y[idx[i]] += x[i] for 0 <= i < n.
   Grid-stride loop so correctness does not depend on the launch covering n exactly, and
   PetscInt indexing avoids truncating/overflowing the index when PetscInt is 64-bit and
   n exceeds INT_MAX (the previous 'int i' would). No atomics are used, so idx[] entries are
   assumed distinct — NOTE(review): callers pass the compressed-row index list, whose entries
   look like unique row numbers; confirm if a new caller appears. */
__global__ static void ScatterAdd(PetscInt n, const PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  const PetscInt stride = (PetscInt)gridDim.x*blockDim.x;
  for (PetscInt i = (PetscInt)blockIdx.x*blockDim.x + threadIdx.x; i < n; i += stride) y[idx[i]] += x[i];
}
3060 
/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op.

   yy may be NULL (plain multiply) or alias zz (in-place add). herm without trans is rejected.
   When the matrix stores compressed rows (zero rows dropped), op(A) acts on/produces vectors
   shorter than the full x/z, so a work vector holds the short product (non-transposed case)
   or the gathered short input (transposed case), and for the non-transposed case the result
   is scattered-added into the full-length z afterwards. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny; /* logical lengths of x and y for y = op(A) x; only set/used for CSR */
#endif

  PetscFunctionBegin;
  PetscCheckFalse(herm && !trans,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
  /* empty matrix: z is just y (or zero) */
  if (!a->nonzerorowcnt) {
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    PetscCheckFalse(!matstruct,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    /* either let cuSPARSE apply the (conjugate) transpose of A, or use an explicitly stored A^T */
    if (herm || !A->form_explicit_transpose) {
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
     #endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
     #endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* opA is used as an index into the cuSpMV descriptor cache below, hence the range check */
      PetscCheckFalse(opA < 0 || opA > 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        cudaError_t cerr;
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                matstruct->matDescr,
                                matstruct->cuSpMV[opA].vecXDescr, beta,
                                matstruct->cuSpMV[opA].vecYDescr,
                                cusparse_scalartype,
                                cusparsestruct->spmvAlg,
                                &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
      }

      stat = cusparseSpMV(cusparsestruct->handle, opA,
                               matstruct->alpha_one,
                               matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
                               matstruct->cuSpMV[opA].vecXDescr,
                               beta,
                               matstruct->cuSpMV[opA].vecYDescr,
                               cusparse_scalartype,
                               cusparsestruct->spmvAlg,
                               matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
     #else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha_one, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
     #endif
    } else {
      if (cusparsestruct->nrows) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha_one, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
       #endif
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
         */
       #if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
       #else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
       #endif
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      /* transposed case: the SpMV already produced full-length zz (with beta=1 when yy==zz);
         only a distinct yy still needs to be added */
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3252 
/* zz = A^T xx + yy */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3261 
/* Finish assembly on the host, then drop the cached device-side matrix representation if
   the assembly changed the nonzero structure (it would be stale). */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscObjectState   statebefore = A->nonzerostate; /* snapshot before host assembly may bump it */
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (cusp->deviceMat && statebefore != A->nonzerostate) {
    cudaError_t cerr;

    ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr);
    cerr = cudaFree(cusp->deviceMat);CHKERRCUDA(cerr);
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}
3279 
3280 /* --------------------------------------------------------------------------------*/
3281 /*@
3282    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
3284    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
3285    assembly performance the user should preallocate the matrix storage by setting
3286    the parameter nz (or the array nnz).  By setting these parameters accurately,
3287    performance during matrix assembly can be increased by more than a factor of 50.
3288 
3289    Collective
3290 
3291    Input Parameters:
3292 +  comm - MPI communicator, set to PETSC_COMM_SELF
3293 .  m - number of rows
3294 .  n - number of columns
3295 .  nz - number of nonzeros per row (same for all rows)
3296 -  nnz - array containing the number of nonzeros in the various rows
3297          (possibly different for each row) or NULL
3298 
3299    Output Parameter:
3300 .  A - the matrix
3301 
3302    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
3304    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
3305 
3306    Notes:
3307    If nnz is given then nz is ignored
3308 
3309    The AIJ format (also called the Yale sparse matrix format or
3310    compressed row storage), is fully compatible with standard Fortran 77
3311    storage.  That is, the stored row and column indices can begin at
3312    either one (as in Fortran) or zero.  See the users' manual for details.
3313 
3314    Specify the preallocated storage with either nz or nnz (not both).
3315    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
3316    allocation.  For large problems you MUST preallocate memory or you
3317    will get TERRIBLE performance, see the users' manual chapter on matrices.
3318 
3319    By default, this format uses inodes (identical nodes) when possible, to
3320    improve numerical efficiency of matrix-vector products and solves. We
3321    search for consecutive rows with the same nonzero structure, thereby
3322    reusing matrix information to achieve increased efficiency.
3323 
3324    Level: intermediate
3325 
3326 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
3327 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); /* sequential matrix: local sizes equal global sizes */
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* preallocate with the host AIJ routine; the data is copied to the GPU lazily on first use */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3339 
/* Release the GPU-side context stored in A->spptr (the plain-matrix context or, for factored
   matrices, the triangular-factor context), detach all composed methods, then delegate to the
   host SeqAIJ destructor for the CPU-side data. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  /* remove every method composed by the convert/setup routines for this type */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetUseCPUSolve_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaijcusparse_hypre_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3363 
3364 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
3365 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicate by copying on the host with the SeqAIJ routine, then converting the copy in
   place to MATSEQAIJCUSPARSE. */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3375 
/* Y = Y + a*X on the GPU.
   Strategy depends on str: SAME_NONZERO_PATTERN uses a single cublas axpy on the value
   arrays; SUBSET_NONZERO_PATTERN uses cusparse spgeam (general sparse matrix add); anything
   else falls back to the host SeqAIJ implementation. When the two patterns are not declared
   equal but look identical (same nnz, same row/column index arrays), str is promoted to
   SAME_NONZERO_PATTERN to take the fast path. */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry,*csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
  /* mixed types (e.g. one operand bound to CPU): use the host kernel */
  if (X->ops->axpy != Y->ops->axpy) {
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
  PetscCheckFalse(cy->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  PetscCheckFalse(cx->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;
  /* see if we can turn this into a cublas axpy */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
    if (eq) {
      eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
    }
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    cusparseStatus_t stat;
    PetscScalar      b = 1.0; /* Y's coefficient in Y = a*X + b*Y */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t           bufferSize;
    void             *buffer;
    cudaError_t      cerr;
#endif

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    /* a and b live on the host, so switch the pointer mode for the spgeam calls */
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
                                          &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                          &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                             cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    cerr = cudaFree(buffer);CHKERRCUDA(cerr);
#else
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
#endif
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical patterns: the sum is a dense axpy on the nz value arrays */
    cublasHandle_t cublasv2handle;
    cublasStatus_t berr;
    PetscBLASInt   one = 1, bnz = 1;

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr);
    ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3473 
/* Y = a*Y, performed directly on the device value array with a single cublas scal call */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)Y->data;
  PetscScalar    *yval;
  cublasHandle_t handle;
  cublasStatus_t cberr;
  PetscBLASInt   inc = 1, n = 1;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUBLASGetHandle(&handle);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(aij->nz,&n);CHKERRQ(ierr); /* number of stored nonzeros */
  ierr = MatSeqAIJCUSPARSEGetArray(Y,&yval);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  cberr = cublasXscal(handle,n,&a,yval,inc);CHKERRCUBLAS(cberr);
  ierr = PetscLogGpuFlops(n);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSERestoreArray(Y,&yval);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3495 
/* MatZeroEntries_SeqAIJCUSPARSE - sets every stored nonzero of A to zero.
   For an unfactored matrix, zeros the device CSR values of A (and of the cached transpose,
   if present) with thrust, then zeros the host values; the offload mask records whether the
   device copy was touched. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;
  PetscBool      zeroedOnDevice = PETSC_FALSE;
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
    if (spptr->mat) {
      CsrMatrix *csr = (CsrMatrix*)spptr->mat->mat;
      if (csr->values) {
        zeroedOnDevice = PETSC_TRUE;
        thrust::fill(thrust::device,csr->values->begin(),csr->values->end(),0.);
      }
    }
    if (spptr->matTranspose) {
      CsrMatrix *csrT = (CsrMatrix*)spptr->matTranspose->mat;
      if (csrT->values) {
        thrust::fill(thrust::device,csrT->values->begin(),csrT->values->end(),0.);
      }
    }
  }
  /* zero the host-side values as well (short version of MatZeroEntries_SeqAIJ) */
  ierr = PetscArrayzero(aij->a,aij->i[A->rmap->n]);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  A->offloadmask = zeroedOnDevice ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
3526 
/*
   MatBindToCPU_SeqAIJCUSPARSE - selects whether subsequent operations on A run on the
   CPU (flg == PETSC_TRUE) or on the GPU through CUSPARSE (flg == PETSC_FALSE), by swapping
   the matrix operation tables and composed functions accordingly.

   For factored matrices only the flag is recorded; the function tables are left alone.
*/
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    A->boundtocpu = flg;
    PetscFunctionReturn(0);
  }
  if (flg) {
    /* moving to the CPU: make sure the host copy of the values is current first */
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    /* restore the plain SeqAIJ implementations */
    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* clearing a->ops resets the get/restore-array hooks to the SeqAIJ defaults */
    ierr = PetscMemzero(a->ops,sizeof(Mat_SeqAIJOps));CHKERRQ(ierr);
    /* remove the GPU-only composed functions */
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  } else {
    /* install the CUSPARSE implementations */
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    /* route value-array access through the GPU-aware getters/setters */
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  /* inode routines only apply to the CPU data structures */
  if (flg && a->inode.size) {
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3590 
/*
   MatConvert_SeqAIJ_SeqAIJCUSPARSE - converts a SeqAIJ matrix to SEQAIJCUSPARSE.

   Creates/reuses the target per the MatReuse flag, allocates the CUSPARSE-specific
   spptr structure (a Mat_SeqAIJCUSPARSE for regular matrices, a
   Mat_SeqAIJCUSPARSETriFactors for factored ones), installs the CUSPARSE
   operation tables via MatBindToCPU_SeqAIJCUSPARSE(), and composes the
   type-specific functions.
*/
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  /* for MAT_INPLACE_MATRIX, *newmat already aliases A */
  B = *newmat;

  /* vectors created from B should live on the GPU by default */
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;
      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      spptr->format     = MAT_CUSPARSE_CSR;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
     #if PETSC_PKG_CUDA_VERSION_GE(11,2,0)
      spptr->spmvAlg    = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
     #else
      spptr->spmvAlg    = CUSPARSE_CSRMV_ALG1;    /* default, since we only support csr */
     #endif
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
     #endif
      B->spptr = spptr;
    } else {
      /* factored matrices carry triangular-factor data instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* install the GPU operation tables and composed functions */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
#if defined(PETSC_HAVE_HYPRE)
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr);
#endif
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetUseCPUSolve_C",MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3652 
/* Constructor registered for MATSEQAIJCUSPARSE: build a host SeqAIJ matrix first,
   then convert it in place to the CUSPARSE type. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3662 
3663 /*MC
3664    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3665 
   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
3667    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
3668    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3669 
3670    Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_use_cpu_solve - Do MatSolve on CPU
3675 
3676   Level: beginner
3677 
3678 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3679 M*/
3680 
3681 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);
3682 
/* Registers the CUSPARSE solver packages: the banded LU solver on plain SeqAIJ, and the
   general CUSPARSE factorizations (LU, Cholesky, ILU, ICC) on SEQAIJCUSPARSE. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode      ierr;
  const MatFactorType ftypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};

  PetscFunctionBegin;
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND,MATSEQAIJ,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr);
  for (size_t i = 0; i < sizeof(ftypes)/sizeof(ftypes[0]); i++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftypes[i],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3696 
/* Releases every GPU-side resource held by a Mat_SeqAIJCUSPARSE structure and frees
   the structure itself; *cusparsestruct is NULL on return (PetscFree zeroes it). */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  Mat_SeqAIJCUSPARSE *cs = *cusparsestruct;
  PetscErrorCode     ierr;
  cusparseStatus_t   stat;
  cudaError_t        cerr;

  PetscFunctionBegin;
  if (cs) {
    /* the matrix and cached-transpose structs, including their cuSPARSE descriptors */
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cs->mat,cs->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cs->matTranspose,cs->format);CHKERRQ(ierr);
    /* device-side work arrays; deleting a NULL pointer is a no-op */
    delete cs->workVector;
    delete cs->rowoffsets_gpu;
    delete cs->cooPerm;
    delete cs->cooPerm_a;
    delete cs->csr2csc_i;
    if (cs->handle) {stat = cusparseDestroy(cs->handle);CHKERRCUSPARSE(stat);}
    if (cs->use_extended_coo) {
      /* extended-COO maps were cudaMalloc'ed in MatSetPreallocationCOO_SeqAIJCUSPARSE */
      cerr = cudaFree(cs->jmap_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cs->perm_d);CHKERRCUDA(cerr);
    }
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3721 
/* Frees the thrust device containers of a CsrMatrix and the struct itself,
   then NULLs the caller's pointer. Safe to call with *mat == NULL. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *m = *mat;

  PetscFunctionBegin;
  if (m) {
    delete m->values;
    delete m->column_indices;
    delete m->row_offsets;
    delete m;
    *mat = NULL;
  }
  PetscFunctionReturn(0);
}
3734 
/*
   Frees one triangular-factor structure: its cuSPARSE matrix descriptor and analysis
   info, the CSR factor data, and the solve/csr2csc scratch buffers. *trifactor is
   NULL on return (PetscFree zeroes its argument). Safe to call with *trifactor == NULL.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    /* device scratch used by the triangular solves */
    if ((*trifactor)->solveBuffer)   {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
    /* host-pinned copy of the factor values, if one was kept */
    if ((*trifactor)->AA_h)   {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* csr2csc conversion scratch only exists with the CUDA-11 API */
    if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
   #endif
    ierr = PetscFree(*trifactor);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3754 
/*
   Frees a Mat_SeqAIJCUSPARSEMultStruct: the wrapped matrix data (interpreted per the
   storage format), the cuSPARSE matrix descriptor, device-side scalar constants, and
   (CUDA >= 11) the cached SpMV descriptors/buffers. *matstruct is NULL on return.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
       #endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr); /* fix: previously the returned error code was silently dropped */
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    /* device-resident 1.0/0.0 scalar constants used with CUSPARSE_POINTER_MODE_DEVICE */
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    /* one cached SpMV setup per operation variant */
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
3798 
/*
   MatSeqAIJCUSPARSETriFactors_Reset - releases all factor data held by the structure
   while keeping the structure (and its cuSPARSE handle) alive for reuse.

   All freed members are reset to NULL so that the function is idempotent: a second
   Reset (or a later Destroy, which calls Reset) must not free stale pointers.
*/
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = NULL;
    (*trifactors)->cpermIndices = NULL;
    (*trifactors)->workVector = NULL;
    /* fix: NULL the band-solver arrays after freeing; the struct survives Reset, so
       leaving them dangling caused a double cudaFree on the next Reset/Destroy */
    if ((*trifactors)->a_band_d)   {cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);(*trifactors)->a_band_d = NULL;}
    if ((*trifactors)->i_band_d)   {cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);(*trifactors)->i_band_d = NULL;}
    (*trifactors)->init_dev_prop = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3821 
/*
   MatSeqAIJCUSPARSETriFactors_Destroy - resets all factor data, destroys the
   cuSPARSE handle, and frees the structure; *trifactors is NULL on return.
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    /* separate assignment and test; the old `if (handle = ...)` form invites
       assignment-vs-comparison mistakes and compiler warnings */
    handle = (*trifactors)->handle;
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3838 
/* Strict weak ordering on (row,col) index pairs: lexicographic, row first, column
   breaking ties. Used to sort COO entries into CSR order. */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() < t2.get<0>() || (t1.get<0>() == t2.get<0>() && t1.get<1>() < t2.get<1>());
  }
};
3849 
/* Equality of (row,col) index pairs: both components must match. Used to collapse
   duplicate COO entries. */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>();
  }
};
3859 
/* Binary op for adjacent_difference: yields 1 where consecutive values differ, 0 where
   they are equal (flags the start of a new index value in a sorted sequence). */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != t2) ? 1 : 0;
  }
};
3868 
/* Binary op: logical OR of two flags, producing 0 or 1 (a change in either the row or
   the column flag marks a new nonzero). */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != 0 || t2 != 0) ? 1 : 0;
  }
};
3877 
3878 #include <thrust/iterator/discard_iterator.h>
3879 /* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic() */
/*
   MatSetValuesCOO_SeqAIJCUSPARSE_Basic - sets (INSERT_VALUES) or accumulates
   (ADD_VALUES) values into A on the GPU, using the permutation cooPerm (and,
   when the COO input had duplicate entries, the segment map cooPerm_a) built by
   MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic().

   v may be a host or device pointer (detected with isCudaMem); v == NULL with
   INSERT_VALUES zeros the matrix. Finishes with a lightweight equivalent of
   MatAssemblyEnd_SeqAIJ and marks the data as current on the GPU only.
*/
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY                           *cooPerm_v = NULL;      /* device staging buffer when v is a host pointer */
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                             *matrix;
  PetscErrorCode                        ierr;
  PetscInt                              n;

  PetscFunctionBegin;
  PetscCheckFalse(!cusp,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  PetscCheckFalse(!cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    /* no COO preallocation was done; just finish assembly */
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  PetscCheckFalse(!matrix->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) {
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else {
    /* copy host values to the device once, then operate there */
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v,v+n);
    d_v = cooPerm_v->data();
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
  }
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to add them up before accumulating */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
        cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonozeros in d_v[].
        cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
      */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* all nonzeros in d_v[] are unique entries */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]]  */
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else {
      /* values[i] = d_v[cooPerm[i]] */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    }
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU; /* only the device copy was updated */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr);
  ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",a->rmax);CHKERRQ(ierr);
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
3961 
/* Marks the cached transpose of A as out of date; with destroy == PETSC_TRUE the cached
   transpose structure and the csr2csc index map are freed as well. No-op when the
   CUSPARSE struct has not been allocated yet. */
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (spptr) {
    if (destroy) {
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&spptr->matTranspose,spptr->format);CHKERRQ(ierr);
      delete spptr->csr2csc_i;
      spptr->csr2csc_i = NULL;
    }
    A->transupdated = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3978 
3979 #include <thrust/binary_search.h>
3980 /* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */
/*
   MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic - builds the CSR structure of A on the GPU
   from COO index arrays (coo_i, coo_j), which must not contain negative indices.

   Produces, on the device, the permutation cooPerm that maps user COO order to sorted
   CSR order and, when the input contains duplicate (i,j) pairs, the segment map
   cooPerm_a used later by MatSetValuesCOO_SeqAIJCUSPARSE_Basic() to reduce duplicates.
   The host CSR arrays (a->i, a->j) are filled by copying the device result back.
*/
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0;  /* nzr counts rows with at least one nonzero */
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  /* discard previously-built permutations if the COO length changed */
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);            /* device copy of coo_i */
    THRUSTINTARRAY d_j(n);            /* device copy of coo_j */
    THRUSTINTARRAY ii(A->rmap->n);    /* per-row upper bounds, becomes a->i[1..n] */

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);

    /* Ex.
      n = 6
      coo_i = [3,3,1,4,1,4]
      coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    *cusp->cooPerm_a = d_i; /* copy the sorted array */
    THRUSTINTARRAY w = d_j;

    /*
      d_i     = [1,1,3,3,4,4]
      d_j     = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i     = [1,3,3,4,4,x]
                            ^ekey
      d_j     = [2,2,3,5,6,x]
                           ^nekye
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());                                              /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
      (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a =          [0,0,1,1,1,1]*/
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin()); /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* rebuild the host CSR arrays from the device result */
    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    /* per-row lengths and maximum row length */
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
  } else {
    /* empty COO input: preallocate an empty matrix */
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);

  A->assembled = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
4098 
/*
   MatSetPreallocationCOO_SeqAIJCUSPARSE - COO preallocation entry point.

   If the index arrays are on the device, or are on the host and contain no negative
   entries, the fast 'Basic' path is used. Otherwise (negative indices mean ignored
   entries) the general SeqAIJ COO machinery builds the pattern on the host, the matrix
   is converted back to CUSPARSE, and the host-side jmap/perm maps are mirrored to the
   device for MatSetValuesCOO (the "extended COO" path).
*/
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  cudaError_t        cerr;
  Mat_SeqAIJ         *seq;
  Mat_SeqAIJCUSPARSE *dev;
  Mat                newmat;
  PetscInt           coo_basic = 1;  /* 1 => indices are safe for the Basic path */
  PetscMemType       mtype = PETSC_MEMTYPE_DEVICE;

  PetscFunctionBegin;
  if (coo_i) {
    ierr = PetscGetMemType(coo_i,&mtype);CHKERRQ(ierr);
    if (PetscMemTypeHost(mtype)) {
      /* only host arrays can be scanned for negative (i.e. to-be-ignored) indices;
         device arrays are assumed non-negative */
      for (PetscCount k=0; k<coo_n; k++) {
        if (coo_i[k] < 0 || coo_j[k] < 0) {coo_basic = 0; break;}
      }
    }
  }

  if (coo_basic) { /* i,j are on device or do not contain negative indices */
    ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j);CHKERRQ(ierr);
  } else {
    /* build the pattern with the general SeqAIJ COO code, then move back to CUSPARSE */
    ierr = MatCreate(PetscObjectComm((PetscObject)mat),&newmat);CHKERRQ(ierr);
    ierr = MatSetSizes(newmat,mat->rmap->n,mat->cmap->n,mat->rmap->N,mat->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(newmat,MATSEQAIJ);CHKERRQ(ierr);
    ierr = MatSetPreallocationCOO_SeqAIJ(newmat,coo_n,coo_i,coo_j);CHKERRQ(ierr);
    ierr = MatConvert(newmat,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&newmat);CHKERRQ(ierr);
    ierr = MatHeaderMerge(mat,&newmat);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(mat);CHKERRQ(ierr);
    ierr = MatZeroEntries(mat);CHKERRQ(ierr); /* Zero matrix on device */

    /* mirror the host COO maps (seq->jmap, seq->perm, presumably built by
       MatSetPreallocationCOO_SeqAIJ - see that routine for their semantics) to the GPU */
    seq  = static_cast<Mat_SeqAIJ*>(mat->data);
    dev  = static_cast<Mat_SeqAIJCUSPARSE*>(mat->spptr);
    /* NOTE(review): if dev->jmap_d/perm_d were already allocated by a prior call with
       use_extended_coo set, they appear to be overwritten without a cudaFree here —
       confirm whether MatHeaderMerge guarantees a fresh spptr */
    cerr = cudaMalloc((void**)&dev->jmap_d,(seq->nz+1)*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMemcpy(dev->jmap_d,seq->jmap,(seq->nz+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&dev->perm_d,seq->Atot*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMemcpy(dev->perm_d,seq->perm,seq->Atot*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    dev->use_extended_coo = PETSC_TRUE;
  }
  PetscFunctionReturn(0);
}
4141 
/* Kernel: reduce COO values kv[] into the CSR value array a[].
   For CSR nonzero i, the contributing COO entries are
   kv[perm[jmap[i]]], ..., kv[perm[jmap[i+1]-1]]. With INSERT_VALUES the
   previous contents of a[i] are discarded; otherwise the sum is added.
   Uses a grid-stride loop, so any launch configuration covers all nnz. */
__global__ void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount jmap[],const PetscCount perm[],InsertMode imode,PetscScalar a[])
{
  const PetscCount stride = gridDim.x * blockDim.x;
  for (PetscCount i = blockIdx.x*blockDim.x + threadIdx.x; i < nnz; i += stride) {
    PetscScalar v = 0.0;
    for (PetscCount k = jmap[i]; k < jmap[i+1]; k++) v += kv[perm[k]];
    a[i] = v + (imode == INSERT_VALUES ? 0.0 : a[i]);
  }
}
4152 
/* Insert/add values given in COO order into a MATSEQAIJCUSPARSE matrix.

   In the extended-COO case, v[] holds one entry per coordinate passed to
   MatSetPreallocationCOO() (seq->coo_n of them) and is reduced on the device
   into the CSR value array via the jmap_d/perm_d arrays built at
   preallocation time; otherwise the basic implementation is used.

   Fixes over the previous version: the kernel is only launched when the
   matrix has nonzeros (a grid dimension of 0 is an invalid launch
   configuration), and launch errors are now checked. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  PetscErrorCode      ierr;
  cudaError_t         cerr;
  Mat_SeqAIJ          *seq = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE  *dev = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscCount          Annz = seq->nz;
  PetscMemType        memtype;
  const PetscScalar   *v1 = v;
  PetscScalar         *Aa;

  PetscFunctionBegin;
  if (dev->use_extended_coo) {
    ierr = PetscGetMemType(v,&memtype);CHKERRQ(ierr);
    if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
      cerr = cudaMalloc((void**)&v1,seq->coo_n*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMemcpy((void*)v1,v,seq->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    }

    /* INSERT_VALUES overwrites everything, so write-only access suffices and
       avoids a potential host-to-device copy of stale values */
    if (imode == INSERT_VALUES) {ierr = MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa);CHKERRQ(ierr);}
    else {ierr = MatSeqAIJCUSPARSEGetArray(A,&Aa);CHKERRQ(ierr);}

    if (Annz) { /* skip the launch when empty: a zero grid dimension is invalid */
      MatAddCOOValues<<<(Annz+255)/256,256>>>(v1,Annz,dev->jmap_d,dev->perm_d,imode,Aa);
      cerr = cudaPeekAtLastError();CHKERRCUDA(cerr); /* catch bad launch configuration */
    }

    if (imode == INSERT_VALUES) {ierr = MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa);CHKERRQ(ierr);}
    else {ierr = MatSeqAIJCUSPARSERestoreArray(A,&Aa);CHKERRQ(ierr);}

    if (PetscMemTypeHost(memtype)) {cerr = cudaFree((void*)v1);CHKERRCUDA(cerr);}
  } else {
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A,v,imode);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
4186 
4187 /*@C
4188     MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices.
4189 
4190    Not collective
4191 
4192     Input Parameters:
4193 +   A - the matrix
4194 -   compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form
4195 
4196     Output Parameters:
+   i - the CSR row pointers
-   j - the CSR column indices
4199 
4200     Level: developer
4201 
4202     Notes:
4203       When compressed is true, the CSR structure does not contain empty rows
4204 
4205 .seealso: MatSeqAIJCUSPARSERestoreIJ(), MatSeqAIJCUSPARSEGetArrayRead()
4206 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  /* returns early (without error) unless the caller asks for BOTH arrays */
  if (!i || !j) PetscFunctionReturn(0);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* only the CSR storage format exposes its index arrays directly */
  PetscCheckFalse(cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); /* ensure device data is current */
  PetscCheckFalse(!cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      /* the device CSR omits empty rows; lazily build and cache the full
         (n+1)-entry row-offset array on the device from the host a->i */
      if (!cusp->rowoffsets_gpu) {
        cusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
        ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(0);
}
4235 
4236 /*@C
4237     MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ()
4238 
4239    Not collective
4240 
4241     Input Parameters:
4242 +   A - the matrix
4243 -   compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form
4244 
4245     Output Parameters:
+   i - the CSR row pointers
-   j - the CSR column indices
4248 
4249     Level: developer
4250 
4251 .seealso: MatSeqAIJCUSPARSEGetIJ()
4252 @*/
/* Invalidate the i/j pointers handed out by MatSeqAIJCUSPARSEGetIJ();
   no device work is needed since the access was read-only. */
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (j) *j = NULL;
  if (i) *i = NULL;
  PetscFunctionReturn(0);
}
4262 
4263 /*@C
4264    MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4265 
4266    Not Collective
4267 
4268    Input Parameter:
4269 .   A - a MATSEQAIJCUSPARSE matrix
4270 
4271    Output Parameter:
4272 .   a - pointer to the device data
4273 
4274    Level: developer
4275 
4276    Notes: may trigger host-device copies if up-to-date matrix data is on host
4277 
4278 .seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArrayRead()
4279 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *matrix;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* only the CSR storage format exposes its value array directly */
  PetscCheckFalse(cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* may trigger a host-to-device copy so the device values are current */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  PetscCheckFalse(!cusparsestruct->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix*)cusparsestruct->mat->mat;
  PetscCheckFalse(!matrix->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = matrix->values->data().get();
  PetscFunctionReturn(0);
}
4298 
4299 /*@C
4300    MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead()
4301 
4302    Not Collective
4303 
4304    Input Parameter:
4305 .   A - a MATSEQAIJCUSPARSE matrix
4306 
4307    Output Parameter:
4308 .   a - pointer to the device data
4309 
4310    Level: developer
4311 
4312 .seealso: MatSeqAIJCUSPARSEGetArrayRead()
4313 @*/
/* Invalidate a pointer obtained with MatSeqAIJCUSPARSEGetArrayRead();
   read access changed nothing, so no state bookkeeping is required. */
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  *a = NULL; /* prevent use of the borrowed pointer after restore */
  PetscFunctionReturn(0);
}
4323 
4324 /*@C
4325    MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4326 
4327    Not Collective
4328 
4329    Input Parameter:
4330 .   A - a MATSEQAIJCUSPARSE matrix
4331 
4332    Output Parameter:
4333 .   a - pointer to the device data
4334 
4335    Level: developer
4336 
4337    Notes: may trigger host-device copies if up-to-date matrix data is on host
4338 
4339 .seealso: MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArray()
4340 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *matrix;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* only the CSR storage format exposes its value array directly */
  PetscCheckFalse(cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* may trigger a host-to-device copy so the device values are current */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  PetscCheckFalse(!cusparsestruct->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix*)cusparsestruct->mat->mat;
  PetscCheckFalse(!matrix->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = matrix->values->data().get();
  /* caller may write through the pointer: device copy becomes authoritative
     and any cached transpose values are stale */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4361 /*@C
4362    MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray()
4363 
4364    Not Collective
4365 
4366    Input Parameter:
4367 .   A - a MATSEQAIJCUSPARSE matrix
4368 
4369    Output Parameter:
4370 .   a - pointer to the device data
4371 
4372    Level: developer
4373 
4374 .seealso: MatSeqAIJCUSPARSEGetArray()
4375 @*/
/* Finish read-write access started with MatSeqAIJCUSPARSEGetArray():
   values may have changed, so the cached diagonal is invalidated and the
   object state is bumped before the pointer is cleared. */
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = MatSeqAIJInvalidateDiagonal(A);
  CHKERRQ(ierr);
  ierr = PetscObjectStateIncrease((PetscObject)A);
  CHKERRQ(ierr);
  *a = NULL; /* prevent use of the borrowed pointer after restore */
  PetscFunctionReturn(0);
}
4389 
4390 /*@C
4391    MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4392 
4393    Not Collective
4394 
4395    Input Parameter:
4396 .   A - a MATSEQAIJCUSPARSE matrix
4397 
4398    Output Parameter:
4399 .   a - pointer to the device data
4400 
4401    Level: developer
4402 
4403    Notes: does not trigger host-device copies and flags data validity on the GPU
4404 
4405 .seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSERestoreArrayWrite()
4406 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *matrix;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* only the CSR storage format exposes its value array directly */
  PetscCheckFalse(cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* unlike MatSeqAIJCUSPARSEGetArray(), no host-to-device copy is performed,
     so the device structure must already exist */
  PetscCheckFalse(!cusparsestruct->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix*)cusparsestruct->mat->mat;
  PetscCheckFalse(!matrix->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = matrix->values->data().get();
  /* device copy becomes authoritative; cached transpose values are stale */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4426 
4427 /*@C
4428    MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite()
4429 
4430    Not Collective
4431 
4432    Input Parameter:
4433 .   A - a MATSEQAIJCUSPARSE matrix
4434 
4435    Output Parameter:
4436 .   a - pointer to the device data
4437 
4438    Level: developer
4439 
4440 .seealso: MatSeqAIJCUSPARSEGetArrayWrite()
4441 @*/
/* Finish write access started with MatSeqAIJCUSPARSEGetArrayWrite():
   values were (potentially) rewritten, so the cached diagonal is invalidated
   and the object state is bumped before the pointer is cleared. */
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = MatSeqAIJInvalidateDiagonal(A);
  CHKERRQ(ierr);
  ierr = PetscObjectStateIncrease((PetscObject)A);
  CHKERRQ(ierr);
  *a = NULL; /* prevent use of the borrowed pointer after restore */
  PetscFunctionReturn(0);
}
4455 
/* Strict weak ordering on (row, col, value, flag) tuples: lexicographic by
   (row, col) only; the value and flag components do not participate. */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    const int r1 = t1.get<0>(), r2 = t2.get<0>();
    if (r1 != r2) return r1 < r2;
    return t1.get<1>() < t2.get<1>();
  }
};
4466 
/* Unary functor adding a fixed offset to an int; used to shift the column
   indices (and transposed row offsets) of B when concatenating with A. */
struct Shift
{
  int _shift;

  Shift(int shift) : _shift(shift) {}
  __host__ __device__
  inline int operator() (const int &c)
  {
    return _shift + c;
  }
};
4478 
/* merges two SeqAIJCUSPARSE matrices A, B side by side into C = [A, B], i.e. concatenating their columns while keeping rows aligned; the [A';B']' operation in MATLAB notation */
4480 PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
4481 {
4482   PetscErrorCode               ierr;
4483   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
4484   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
4485   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
4486   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
4487   PetscInt                     Annz,Bnnz;
4488   cusparseStatus_t             stat;
4489   PetscInt                     i,m,n,zero = 0;
4490   cudaError_t                  cerr;
4491 
4492   PetscFunctionBegin;
4493   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
4494   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
4495   PetscValidPointer(C,4);
4496   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
4497   PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
4498   PetscCheckFalse(A->rmap->n != B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
4499   PetscCheckFalse(reuse == MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
4500   PetscCheckFalse(Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4501   PetscCheckFalse(Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4502   if (reuse == MAT_INITIAL_MATRIX) {
4503     m     = A->rmap->n;
4504     n     = A->cmap->n + B->cmap->n;
4505     ierr  = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
4506     ierr  = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
4507     ierr  = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
4508     c     = (Mat_SeqAIJ*)(*C)->data;
4509     Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
4510     Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
4511     Ccsr  = new CsrMatrix;
4512     Cmat->cprowIndices      = NULL;
4513     c->compressedrow.use    = PETSC_FALSE;
4514     c->compressedrow.nrows  = 0;
4515     c->compressedrow.i      = NULL;
4516     c->compressedrow.rindex = NULL;
4517     Ccusp->workVector       = NULL;
4518     Ccusp->nrows    = m;
4519     Ccusp->mat      = Cmat;
4520     Ccusp->mat->mat = Ccsr;
4521     Ccsr->num_rows  = m;
4522     Ccsr->num_cols  = n;
4523     stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
4524     stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4525     stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
4526     cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
4527     cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
4528     cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
4529     cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4530     cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4531     cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4532     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
4533     ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
4534     PetscCheckFalse(!Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4535     PetscCheckFalse(!Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4536 
4537     Acsr = (CsrMatrix*)Acusp->mat->mat;
4538     Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4539     Annz = (PetscInt)Acsr->column_indices->size();
4540     Bnnz = (PetscInt)Bcsr->column_indices->size();
4541     c->nz = Annz + Bnnz;
4542     Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
4543     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4544     Ccsr->values = new THRUSTARRAY(c->nz);
4545     Ccsr->num_entries = c->nz;
4546     Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
4547     if (c->nz) {
4548       auto Acoo = new THRUSTINTARRAY32(Annz);
4549       auto Bcoo = new THRUSTINTARRAY32(Bnnz);
4550       auto Ccoo = new THRUSTINTARRAY32(c->nz);
4551       THRUSTINTARRAY32 *Aroff,*Broff;
4552 
4553       if (a->compressedrow.use) { /* need full row offset */
4554         if (!Acusp->rowoffsets_gpu) {
4555           Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
4556           Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
4557           ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
4558         }
4559         Aroff = Acusp->rowoffsets_gpu;
4560       } else Aroff = Acsr->row_offsets;
4561       if (b->compressedrow.use) { /* need full row offset */
4562         if (!Bcusp->rowoffsets_gpu) {
4563           Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
4564           Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
4565           ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
4566         }
4567         Broff = Bcusp->rowoffsets_gpu;
4568       } else Broff = Bcsr->row_offsets;
4569       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4570       stat = cusparseXcsr2coo(Acusp->handle,
4571                               Aroff->data().get(),
4572                               Annz,
4573                               m,
4574                               Acoo->data().get(),
4575                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4576       stat = cusparseXcsr2coo(Bcusp->handle,
4577                               Broff->data().get(),
4578                               Bnnz,
4579                               m,
4580                               Bcoo->data().get(),
4581                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4582       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
4583       auto Aperm = thrust::make_constant_iterator(1);
4584       auto Bperm = thrust::make_constant_iterator(0);
4585 #if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
4586       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
4587       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
4588 #else
4589       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
4590       auto Bcib = Bcsr->column_indices->begin();
4591       auto Bcie = Bcsr->column_indices->end();
4592       thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
4593 #endif
4594       auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
4595       auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
4596       auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
4597       auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
4598       auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
4599       auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
4600       auto p1 = Ccusp->cooPerm->begin();
4601       auto p2 = Ccusp->cooPerm->begin();
4602       thrust::advance(p2,Annz);
4603       PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
4604 #if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
4605       thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
4606 #endif
4607       auto cci = thrust::make_counting_iterator(zero);
4608       auto cce = thrust::make_counting_iterator(c->nz);
4609 #if 0 //Errors on SUMMIT cuda 11.1.0
4610       PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
4611 #else
4612       auto pred = thrust::identity<int>();
4613       PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
4614       PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
4615 #endif
4616       stat = cusparseXcoo2csr(Ccusp->handle,
4617                               Ccoo->data().get(),
4618                               c->nz,
4619                               m,
4620                               Ccsr->row_offsets->data().get(),
4621                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4622       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4623       delete wPerm;
4624       delete Acoo;
4625       delete Bcoo;
4626       delete Ccoo;
4627 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4628       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
4629                                Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
4630                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4631                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4632 #endif
4633       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
4634         ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
4635         ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr);
4636         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4637         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4638         CsrMatrix *CcsrT = new CsrMatrix;
4639         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4640         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4641 
4642         (*C)->form_explicit_transpose = PETSC_TRUE;
4643         (*C)->transupdated = PETSC_TRUE;
4644         Ccusp->rowoffsets_gpu = NULL;
4645         CmatT->cprowIndices = NULL;
4646         CmatT->mat = CcsrT;
4647         CcsrT->num_rows = n;
4648         CcsrT->num_cols = m;
4649         CcsrT->num_entries = c->nz;
4650 
4651         CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
4652         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4653         CcsrT->values = new THRUSTARRAY(c->nz);
4654 
4655         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4656         auto rT = CcsrT->row_offsets->begin();
4657         if (AT) {
4658           rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
4659           thrust::advance(rT,-1);
4660         }
4661         if (BT) {
4662           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
4663           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
4664           thrust::copy(titb,tite,rT);
4665         }
4666         auto cT = CcsrT->column_indices->begin();
4667         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
4668         if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
4669         auto vT = CcsrT->values->begin();
4670         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4671         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4672         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4673 
4674         stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
4675         stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4676         stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
4677         cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
4678         cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
4679         cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
4680         cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4681         cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4682         cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4683 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4684         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
4685                                  CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
4686                                  CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4687                                  CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4688 #endif
4689         Ccusp->matTranspose = CmatT;
4690       }
4691     }
4692 
4693     c->singlemalloc = PETSC_FALSE;
4694     c->free_a       = PETSC_TRUE;
4695     c->free_ij      = PETSC_TRUE;
4696     ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
4697     ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
4698     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
4699       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4700       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4701       ii   = *Ccsr->row_offsets;
4702       jj   = *Ccsr->column_indices;
4703       cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4704       cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4705     } else {
4706       cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4707       cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4708     }
4709     ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
4710     ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
4711     ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
4712     c->maxnz = c->nz;
4713     c->nonzerorowcnt = 0;
4714     c->rmax = 0;
4715     for (i = 0; i < m; i++) {
4716       const PetscInt nn = c->i[i+1] - c->i[i];
4717       c->ilen[i] = c->imax[i] = nn;
4718       c->nonzerorowcnt += (PetscInt)!!nn;
4719       c->rmax = PetscMax(c->rmax,nn);
4720     }
4721     ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
4722     ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
4723     (*C)->nonzerostate++;
4724     ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
4725     ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
4726     Ccusp->nonzerostate = (*C)->nonzerostate;
4727     (*C)->preallocated  = PETSC_TRUE;
4728   } else {
4729     PetscCheckFalse((*C)->rmap->n != B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,(*C)->rmap->n,B->rmap->n);
4730     c = (Mat_SeqAIJ*)(*C)->data;
4731     if (c->nz) {
4732       Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
4733       PetscCheckFalse(!Ccusp->cooPerm,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
4734       PetscCheckFalse(Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4735       PetscCheckFalse(Ccusp->nonzerostate != (*C)->nonzerostate,PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
4736       ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
4737       ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
4738       PetscCheckFalse(!Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4739       PetscCheckFalse(!Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4740       Acsr = (CsrMatrix*)Acusp->mat->mat;
4741       Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4742       Ccsr = (CsrMatrix*)Ccusp->mat->mat;
4743       PetscCheckFalse(Acsr->num_entries != (PetscInt)Acsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %" PetscInt_FMT " != %" PetscInt_FMT,Acsr->num_entries,(PetscInt)Acsr->values->size());
4744       PetscCheckFalse(Bcsr->num_entries != (PetscInt)Bcsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %" PetscInt_FMT " != %" PetscInt_FMT,Bcsr->num_entries,(PetscInt)Bcsr->values->size());
4745       PetscCheckFalse(Ccsr->num_entries != (PetscInt)Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT,Ccsr->num_entries,(PetscInt)Ccsr->values->size());
4746       PetscCheckFalse(Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries,PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT,Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
4747       PetscCheckFalse(Ccusp->cooPerm->size() != Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %" PetscInt_FMT " != %" PetscInt_FMT,(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
4748       auto pmid = Ccusp->cooPerm->begin();
4749       thrust::advance(pmid,Acsr->num_entries);
4750       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4751       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
4752                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
4753       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
4754                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4755       thrust::for_each(zibait,zieait,VecCUDAEquals());
4756       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
4757                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4758       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
4759                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
4760       thrust::for_each(zibbit,ziebit,VecCUDAEquals());
4761       ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr);
4762       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
4763         PetscCheckFalse(!Ccusp->matTranspose,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4764         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4765         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4766         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4767         CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
4768         auto vT = CcsrT->values->begin();
4769         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4770         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4771         (*C)->transupdated = PETSC_TRUE;
4772       }
4773       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4774     }
4775   }
4776   ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
4777   (*C)->assembled     = PETSC_TRUE;
4778   (*C)->was_assembled = PETSC_FALSE;
4779   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
4780   PetscFunctionReturn(0);
4781 }
4782 
/*
   Copies n entries of the value array of A into v: entry i of v receives a[idx[i]]
   when idx is provided, or a[i] (the leading n values) when idx is NULL.
   The destination v may live in either host or device memory; isCudaMem() detects
   which, and the transfer is routed (and logged) accordingly.
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  PetscErrorCode    ierr;
  bool              dmem;   /* true when v points to device memory */
  const PetscScalar *av;
  cudaError_t       cerr;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
  if (n && idx) {
    /* upload the index set, then gather av[idx[i]] entirely on the device */
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

    THRUSTARRAY *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      /* v is host memory: stage the gather in a temporary device buffer,
         then copy it down to the host afterwards */
      w = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (w) {
      cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    delete w;
  } else {
    /* no index set: copy the leading n values of av directly into v */
    cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  /* when v is host memory the scalar transfer above was device->host, so log
     GpuToCpu (previously this incorrectly logged CpuToGpu) */
  if (!dmem) { ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr); }
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4822