xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 1390d3a27d88add7d79c9b38bf1a895ae5e67af6)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
7 
8 #include <petscconf.h>
9 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
10 #include <../src/mat/impls/sbaij/seq/sbaij.h>
11 #include <../src/vec/vec/impls/dvecimpl.h>
12 #include <petsc/private/vecimpl.h>
13 #undef VecType
14 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
15 #include <thrust/adjacent_difference.h>
16 #include <thrust/async/for_each.h>
17 #include <thrust/iterator/constant_iterator.h>
18 #include <thrust/remove.h>
19 #include <thrust/sort.h>
20 #include <thrust/unique.h>
21 
/* String table for MatCUSPARSEStorageFormat: the enum value names in 0-based order,
   followed by the enum type name, the option prefix, and a NULL sentinel -- the
   layout PetscOptionsEnum() expects when parsing command line options. */
const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
    0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
  } cusparseCsr2CscAlg_t;
  */
  /* Note: in MatCUSPARSESpMMAlgorithms[] the names appear in integer-value order
     (so "COO_ALG4" with value 5 comes after "CSR_ALG1" with value 4), matching the
     enum values shown above. */
  const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
  const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
  const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif
58 
59 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
60 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
61 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
62 
63 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
64 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
65 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
66 
67 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
68 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
69 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
70 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
71 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
72 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
73 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
74 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
75 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
76 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
77 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
78 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
79 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
80 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
81 
82 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
83 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
84 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
85 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
86 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
87 
88 PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
89 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
90 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);
91 
92 PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
93 PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
94 
95 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
96 
/*
   MatCUSPARSESetStream - records the given CUDA stream in the matrix's cusparse
   data structure and tells the cusparse handle to execute on that stream.
   Errors with PETSC_ERR_COR if the matrix has no cusparse data attached.
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cstat;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusp->stream = stream;
  cstat = cusparseSetStream(cusp->handle,cusp->stream);CHKERRCUSPARSE(cstat);
  PetscFunctionReturn(0);
}
108 
/*
   MatCUSPARSESetHandle - attaches the given cusparse handle to the matrix.
   A previously held handle that differs from the incoming one is destroyed
   first; the pointer mode of the (possibly new) handle is then set to
   device pointers. Errors with PETSC_ERR_COR if no cusparse data is attached.
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cstat;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusp->handle != handle) {
    /* release the old handle before adopting the new one */
    if (cusp->handle) {cstat = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(cstat);}
    cusp->handle = handle;
  }
  cstat = cusparseSetPointerMode(cusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(cstat);
  PetscFunctionReturn(0);
}
125 
/*
   MatCUSPARSEClearHandle - forgets (without destroying) the cusparse handle
   stored in the matrix. A no-op when the matrix is not of type
   MATSEQAIJCUSPARSE or has no cusparse data attached.
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          isseqaijcusparse;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&isseqaijcusparse);CHKERRQ(ierr);
  /* only drop the reference; ownership of the handle stays with the caller of MatCUSPARSESetHandle() */
  if (isseqaijcusparse && cusp && cusp->handle) cusp->handle = 0;
  PetscFunctionReturn(0);
}
138 
/* Reports the solver package ("cusparse") of a factor matrix; composed on factor
   matrices created by MatGetFactor_seqaijcusparse_cusparse() under the name
   "MatFactorGetSolverType_C". */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
145 
146 /*MC
147   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
150   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
151   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
152   algorithms are not recommended. This class does NOT support direct solver operations.
153 
154   Level: beginner
155 
156 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
157 M*/
158 
/*
  MatGetFactor_seqaijcusparse_cusparse - creates an (unallocated) factor matrix B of
  type MATSEQAIJCUSPARSE for A and installs the symbolic-factorization callbacks
  matching the requested factor type.

  LU/ILU/ILUDT and Cholesky/ICC are supported; any other factor type is an error.
  When A is bound to the CPU the plain SeqAIJ symbolic routines are installed instead
  of the CUSPARSE ones, so the whole factorization stays on the host.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  /* propagate A's CPU binding to the factor when requested */
  if (A->boundtocpu && A->bindingpropagates) { ierr = MatBindToCPU(*B,PETSC_TRUE);CHKERRQ(ierr); }
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    /* preferred orderings: nested dissection for full LU, natural ordering for incomplete factorizations */
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);CHKERRQ(ierr);
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]);CHKERRQ(ierr);
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* preallocation is skipped; the symbolic phase allocates the factor structure */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  (*B)->canuseordering = PETSC_TRUE;
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
200 
/*
  MatCUSPARSESetFormat_SeqAIJCUSPARSE - type-specific implementation behind
  MatCUSPARSESetFormat(); stores the requested storage format in the matrix's
  cusparse data structure.

  For sequential matrices MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL both set the one
  and only format field, so the two cases share a branch. Any other operation is
  rejected with PETSC_ERR_SUP. A missing spptr is reported as corruption, matching
  MatCUSPARSESetStream()/MatCUSPARSESetHandle().
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  switch (op) {
  case MAT_CUSPARSE_MULT:   /* fall through: a seq matrix has a single format field */
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
218 
219 /*@
220    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
221    operation. Only the MatMult operation can use different GPU storage formats
222    for MPIAIJCUSPARSE matrices.
223    Not Collective
224 
225    Input Parameters:
226 +  A - Matrix of type SEQAIJCUSPARSE
227 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
228 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
229 
230    Output Parameter:
231 
232    Level: intermediate
233 
234 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
235 @*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation; silently a no-op for matrix
     types that do not compose "MatCUSPARSESetFormat_C" */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
245 
/*
  MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE - type-specific implementation behind
  MatCUSPARSESetUseCPUSolve(); records whether MatSolve should run on the CPU.

  A missing spptr is reported as corruption, matching the other seqaijcusparse
  setters (MatCUSPARSESetStream()/MatCUSPARSESetHandle()) instead of dereferencing
  a NULL pointer.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A,PetscBool use_cpu)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusparsestruct->use_cpu_solve = use_cpu;
  PetscFunctionReturn(0);
}
254 
255 /*@
256    MatCUSPARSESetUseCPUSolve - Sets use CPU MatSolve.
257 
258    Input Parameters:
259 +  A - Matrix of type SEQAIJCUSPARSE
260 -  use_cpu - set flag for using the built-in CPU MatSolve
261 
262    Output Parameter:
263 
264    Notes:
265    The cuSparse LU solver currently computes the factors with the built-in CPU method
266    and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
   This method is used to specify whether the solve is done on the CPU or GPU (GPU is the default).
268 
269    Level: intermediate
270 
271 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
272 @*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A,PetscBool use_cpu)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation; silently a no-op for matrix
     types that do not compose "MatCUSPARSESetUseCPUSolve_C" */
  ierr = PetscTryMethod(A,"MatCUSPARSESetUseCPUSolve_C",(Mat,PetscBool),(A,use_cpu));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
282 
/*
  MatSetOption_SeqAIJCUSPARSE - handles MAT_FORM_EXPLICIT_TRANSPOSE specially
  (the cached GPU transpose must be invalidated when the option is turned off);
  every other option is forwarded to the SeqAIJ implementation.
*/
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);}
    A->form_explicit_transpose = flg;
  } else {
    ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
300 
301 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
302 
/*
  MatLUFactorNumeric_SeqAIJCUSPARSE - the numeric factorization itself runs on the
  CPU (MatLUFactorNumeric_SeqAIJ); afterwards the solve callbacks are chosen
  (natural-ordering GPU kernels when both permutations are identities, general GPU
  kernels otherwise, or the inherited CPU defaults when use_cpu_solve is set) and,
  unless the CPU solve was requested, the triangular factors are analyzed and
  copied to the GPU.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ         *b = (Mat_SeqAIJ*)B->data;
  IS                 isrow = b->row,iscol = b->col;
  PetscBool          rowid,colid;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* select the MatSolve implementation from the row/column permutations */
  ierr = ISIdentity(isrow,&rowid);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&colid);CHKERRQ(ierr);
  if (!cusp->use_cpu_solve) {
    if (rowid && colid) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
  }
  /* no GPU MatMatSolve: fall back to the defaults in either case */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors onto the GPU */
  if (!cusp->use_cpu_solve) {
    ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
340 
/*
  MatSetFromOptions_SeqAIJCUSPARSE - processes the -mat_cusparse_* command line
  options for unfactored matrices: storage format for SpMV and/or triangular
  solve, CPU vs GPU solve, and (CUDA >= 11) the cuSPARSE SpMV/SpMM/csr2csc
  algorithm choices. After each algorithm option, a sanity check verifies that
  PETSc's 0-based table positions still match the cuSPARSE enum values, since
  PetscOptionsEnum() assigns values by array position.
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
    ierr = PetscOptionsBool("-mat_cusparse_use_cpu_solve","Use CPU (I)LU solve","MatCUSPARSESetUseCPUSolve",cusparsestruct->use_cpu_solve,&cusparsestruct->use_cpu_solve,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetUseCPUSolve(A,cusparsestruct->use_cpu_solve);CHKERRQ(ierr);}
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
    /* CUDA 11.4 renamed CUSPARSE_CSRMV_ALG1 to CUSPARSE_SPMV_CSR_ALG1; both must sit at table index 2 */
    if (flg && CUSPARSE_SPMV_CSR_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#else
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#endif
    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
   #endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
381 
/* Symbolic ILU: drop any stale GPU triangular-factor data, run the CPU symbolic
   phase, then redirect the numeric phase to the CUSPARSE version. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&trifactors);CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
393 
/* Symbolic LU: drop any stale GPU triangular-factor data, run the CPU symbolic
   phase, then redirect the numeric phase to the CUSPARSE version. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&trifactors);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
405 
/* Symbolic ICC: drop any stale GPU triangular-factor data, run the CPU symbolic
   phase, then redirect the numeric phase to the CUSPARSE Cholesky version. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&trifactors);CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
417 
/* Symbolic Cholesky: drop any stale GPU triangular-factor data, run the CPU
   symbolic phase, then redirect the numeric phase to the CUSPARSE version. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&trifactors);CHKERRQ(ierr);
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
429 
/*
  MatSeqAIJCUSPARSEBuildILULowerTriMatrix - builds (or refreshes) the GPU copy of
  the unit lower triangular factor L from the host ILU factorization stored in A.

  The host factor stores L without its unit diagonal; here the diagonal 1's are
  inserted explicitly so cusparse can solve with a complete CSR matrix. On first
  call the full structure (row offsets, column indices, values) is assembled in
  pinned host memory, uploaded, and the cusparse solve analysis is performed; on
  later calls only the numerical values are refreshed. The pinned value buffer
  AA_h is kept alive on the factor struct for those updates.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* n unit diagonal entries plus the stored entries of rows 1..n-1 (row 0 of L is just the diagonal) */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {
        /* first call: build the whole CSR structure in pinned host buffers */
        PetscScalar                       *AALo;

        cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the lower triangular matrix */
        cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the lower triangular matrix */
        AiLo[0]  = (PetscInt) 0;
        AiLo[n]  = nzLower;
        AjLo[0]  = (PetscInt) 0;       /* row 0 of L contains only the unit diagonal */
        AALo[0]  = (MatScalar) 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;                  /* write position in AjLo/AALo */
        rowOffset= 1;                  /* running CSR row pointer */
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          /* copy the strictly-lower entries of row i, then append the unit diagonal */
          ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix; the assign() calls below copy the pinned host buffers to the device */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* newer cusparse needs an explicit work buffer for the analysis/solve */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(),
                               #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 loTriFactor->solveInfo,
                                 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                               #else
                                 loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
                               #endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        /* keep the pinned value buffer for later numeric-only updates; the index buffers are no longer needed on the host */
        loTriFactor->AA_h = AALo;
        cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the lower triangular matrix: same layout as above (row values then unit diagonal), sparsity pattern unchanged */
        loTriFactor->AA_h[0]  = 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
          offset      += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset      += 1;
          v  += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
576 
577 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
578 {
579   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
580   PetscInt                          n = A->rmap->n;
581   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
582   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
583   cusparseStatus_t                  stat;
584   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
585   const MatScalar                   *aa = a->a,*v;
586   PetscInt                          *AiUp, *AjUp;
587   PetscInt                          i,nz, nzUpper, offset;
588   PetscErrorCode                    ierr;
589   cudaError_t                       cerr;
590 
591   PetscFunctionBegin;
592   if (!n) PetscFunctionReturn(0);
593   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
594     try {
595       /* next, figure out the number of nonzeros in the upper triangular matrix. */
596       nzUpper = adiag[0]-adiag[n];
597       if (!upTriFactor) {
598         PetscScalar *AAUp;
599 
600         cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
601 
602         /* Allocate Space for the upper triangular matrix */
603         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
604         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
605 
606         /* Fill the upper triangular matrix */
607         AiUp[0]=(PetscInt) 0;
608         AiUp[n]=nzUpper;
609         offset = nzUpper;
610         for (i=n-1; i>=0; i--) {
611           v  = aa + adiag[i+1] + 1;
612           vi = aj + adiag[i+1] + 1;
613 
614           /* number of elements NOT on the diagonal */
615           nz = adiag[i] - adiag[i+1]-1;
616 
617           /* decrement the offset */
618           offset -= (nz+1);
619 
620           /* first, set the diagonal elements */
621           AjUp[offset] = (PetscInt) i;
622           AAUp[offset] = (MatScalar)1./v[nz];
623           AiUp[i]      = AiUp[i+1] - (nz+1);
624 
625           ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
626           ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
627         }
628 
629         /* allocate space for the triangular factor information */
630         ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
631         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
632 
633         /* Create the matrix description */
634         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
635         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
636        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
637         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
638        #else
639         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
640        #endif
641         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
642         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
643 
644         /* set the operation */
645         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
646 
647         /* set the matrix */
648         upTriFactor->csrMat = new CsrMatrix;
649         upTriFactor->csrMat->num_rows = n;
650         upTriFactor->csrMat->num_cols = n;
651         upTriFactor->csrMat->num_entries = nzUpper;
652 
653         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
654         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
655 
656         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
657         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
658 
659         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
660         upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
661 
662         /* Create the solve analysis information */
663         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
664         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
665       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
666         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
667                                      upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
668                                      upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
669                                      upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
670                                      &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
671         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
672       #endif
673 
674         /* perform the solve analysis */
675         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
676                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
677                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
678                                  upTriFactor->csrMat->column_indices->data().get(),
679                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
680                                  upTriFactor->solveInfo,
681                                  upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
682                                #else
683                                  upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
684                                #endif
685         cerr = WaitForCUDA();CHKERRCUDA(cerr);
686         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
687 
688         /* assign the pointer */
689         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
690         upTriFactor->AA_h = AAUp;
691         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
692         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
693         ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
694       } else {
695         if (!upTriFactor->AA_h) {
696           cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
697         }
698         /* Fill the upper triangular matrix */
699         offset = nzUpper;
700         for (i=n-1; i>=0; i--) {
701           v  = aa + adiag[i+1] + 1;
702 
703           /* number of elements NOT on the diagonal */
704           nz = adiag[i] - adiag[i+1]-1;
705 
706           /* decrement the offset */
707           offset -= (nz+1);
708 
709           /* first, set the diagonal elements */
710           upTriFactor->AA_h[offset] = 1./v[nz];
711           ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
712         }
713         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
714         ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
715       }
716     } catch(char *ex) {
717       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
718     }
719   }
720   PetscFunctionReturn(0);
721 }
722 
/*
  MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU - makes the ILU triangular factors of A available
  on the GPU for the solve phase.

  Builds (or refreshes) the lower and upper triangular factor structures on the device, lazily
  creates the device work vector used during MatSolve, and caches the row/column permutations
  of the factorization as device index arrays (only when the permutations are not the identity,
  since identity permutations need no gather/scatter at solve time).

  Collective on A? No - operates only on local (Seq) data.

  Input Parameter:
.  A - the factored SeqAIJCUSPARSE matrix; A->spptr must already hold a Mat_SeqAIJCUSPARSETriFactors
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  /* copy/convert the host factors L and U into device CSR structures */
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* device scratch vector of length n, reused by every triangular solve; allocated once */
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  cusparseTriFactors->nnz=a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices */
  /* cache the row permutation on the device the first time through (skipped for identity) */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity && !cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);  /* host-to-device copy of the indices */
    ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* upper triangular indices */
  /* likewise cache the column permutation used after the upper solve */
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity && !cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);  /* host-to-device copy of the indices */
    ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
766 
/*
  MatSeqAIJCUSPARSEBuildICCTriMatrices - builds the device-side triangular structures for an
  incomplete Cholesky (ICC) factor.

  Only the upper triangle U is stored (symmetric factorization).  Two device CSR matrices are
  created that SHARE the same sparsity pattern (AiUp/AjUp) but carry different values:
    - upTriFactor: U with unit diagonal (CUSPARSE_DIAG_TYPE_UNIT), solved non-transposed;
    - loTriFactor: the "lower" solve, realized as U^T by keeping the upper-triangular storage
      (CUSPARSE_FILL_MODE_UPPER) and setting solveOp = CUSPARSE_OPERATION_TRANSPOSE, with a
      non-unit diagonal.
  On the first call both structures plus their cuSPARSE analysis info are created; on later
  calls (factor values changed, pattern unchanged) only the numerical values are re-staged
  and re-uploaded.

  Input Parameter:
.  A - the ICC-factored matrix; A->spptr must hold a Mat_SeqAIJCUSPARSETriFactors
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is cast both as Mat_SeqAIJ (above) and Mat_SeqSBAIJ (below);
     presumably the ICC factor stores SBAIJ-compatible i/j/a arrays - confirm against the
     host-side Cholesky factorization before touching this aliasing */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers for the upper- and lower-solve values */
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          /* v[nz] is the row's stored diagonal entry; its reciprocal goes on the diagonal
             of both factors' value arrays */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            /* off-diagonal entries: negated for the upper solve, additionally scaled by the
               reciprocal diagonal for the (transpose) lower solve */
            ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        /* unit diagonal: the stored diagonal values are not read during the upper solve */
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* csrsv2-style API needs an explicit, per-factor work buffer for the analysis/solve */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(),
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 upTriFactor->solveInfo,
                                 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                                #else
                                  upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
                                #endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        /* UPPER fill mode is deliberate: the lower solve is performed as U^T via the
           TRANSPOSE operation below, reusing the upper-triangular storage */
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        /* same sparsity pattern as upTriFactor, different values (AALo) */
        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(),
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 loTriFactor->solveInfo,
                                 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                                #else
                                 loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
                                #endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      } else {
        /* structures already exist: only the numerical values changed, so re-stage them on
           the host and re-upload; the sparsity pattern and analysis info are reused */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
986 
/*
  MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU - makes the ICC triangular factors of A available
  on the GPU for the solve phase.

  Builds/refreshes the device factor structures, lazily allocates the solve-phase work
  vector, records the factor's nonzero count, and - when the factorization permutation is
  not the identity - caches both the permutation and its inverse as device index arrays
  (row permutation applied before the solves, inverse applied after).

  Input Parameter:
.  A - the ICC-factored SeqAIJCUSPARSE matrix; A->spptr must already hold a Mat_SeqAIJCUSPARSETriFactors
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  /* only the upper triangle is stored (a->nz entries); count both triangles once and the
     shared diagonal once */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip,*rip;

    /* rperm = permutation, cperm = its inverse; both uploaded to the device once */
    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1024 
/*
  MatCholeskyFactorNumeric_SeqAIJCUSPARSE - numeric Cholesky factorization for SeqAIJCUSPARSE.

  The factorization itself is performed on the host by the SeqAIJ implementation; this wrapper
  first ensures A's host copy is current, then installs the appropriate GPU solve routines on
  the factor B (natural-ordering fast path when the permutation is the identity) and uploads
  the triangular factors to the device.

  Input Parameters:
+  B    - the factor matrix (output of symbolic factorization)
.  A    - the matrix being factored
-  info - factorization options

  Notes:
  MatMatSolve (matsolve/matsolvetranspose) is not provided by this implementation, so those
  function pointers are cleared regardless of the ordering.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* the host routine reads A's CSR arrays, so pull any GPU-resident data back first */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  /* common to both orderings: no MatMatSolve support on the GPU factor */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1054 
1055 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
1056 {
1057   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1058   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1059   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1060   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
1061   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1062   cusparseStatus_t                  stat;
1063   cusparseIndexBase_t               indexBase;
1064   cusparseMatrixType_t              matrixType;
1065   cusparseFillMode_t                fillMode;
1066   cusparseDiagType_t                diagType;
1067   cudaError_t                       cerr;
1068   PetscErrorCode                    ierr;
1069 
1070   PetscFunctionBegin;
1071   /* allocate space for the transpose of the lower triangular factor */
1072   ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
1073   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1074 
1075   /* set the matrix descriptors of the lower triangular factor */
1076   matrixType = cusparseGetMatType(loTriFactor->descr);
1077   indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
1078   fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1079     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1080   diagType = cusparseGetMatDiagType(loTriFactor->descr);
1081 
1082   /* Create the matrix description */
1083   stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
1084   stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
1085   stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
1086   stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
1087   stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
1088 
1089   /* set the operation */
1090   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1091 
1092   /* allocate GPU space for the CSC of the lower triangular factor*/
1093   loTriFactorT->csrMat = new CsrMatrix;
1094   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
1095   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
1096   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
1097   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
1098   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1099   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
1100 
1101   /* compute the transpose of the lower triangular factor, i.e. the CSC */
1102 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1103   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1104                                        loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1105                                        loTriFactor->csrMat->values->data().get(),
1106                                        loTriFactor->csrMat->row_offsets->data().get(),
1107                                        loTriFactor->csrMat->column_indices->data().get(),
1108                                        loTriFactorT->csrMat->values->data().get(),
1109                                        loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1110                                        CUSPARSE_ACTION_NUMERIC,indexBase,
1111                                        CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1112   cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1113 #endif
1114 
1115   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1116   stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1117                           loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1118                           loTriFactor->csrMat->values->data().get(),
1119                           loTriFactor->csrMat->row_offsets->data().get(),
1120                           loTriFactor->csrMat->column_indices->data().get(),
1121                           loTriFactorT->csrMat->values->data().get(),
1122                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1123                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1124                           CUSPARSE_ACTION_NUMERIC, indexBase,
1125                           CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat);
1126                         #else
1127                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1128                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1129                         #endif
1130   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1131   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1132 
1133   /* Create the solve analysis information */
1134   ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1135   stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1136 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1137   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
1138                                 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1139                                 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1140                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
1141                                 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1142   cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1143 #endif
1144 
1145   /* perform the solve analysis */
1146   stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
1147                            loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1148                            loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1149                            loTriFactorT->csrMat->column_indices->data().get(),
1150                           #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1151                            loTriFactorT->solveInfo,
1152                            loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
1153                           #else
1154                            loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1155                           #endif
1156   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1157   ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1158 
1159   /* assign the pointer */
1160   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1161 
1162   /*********************************************/
1163   /* Now the Transpose of the Upper Tri Factor */
1164   /*********************************************/
1165 
1166   /* allocate space for the transpose of the upper triangular factor */
1167   ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
1168   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1169 
1170   /* set the matrix descriptors of the upper triangular factor */
1171   matrixType = cusparseGetMatType(upTriFactor->descr);
1172   indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
1173   fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1174     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1175   diagType = cusparseGetMatDiagType(upTriFactor->descr);
1176 
1177   /* Create the matrix description */
1178   stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
1179   stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
1180   stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
1181   stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
1182   stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
1183 
1184   /* set the operation */
1185   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1186 
1187   /* allocate GPU space for the CSC of the upper triangular factor*/
1188   upTriFactorT->csrMat = new CsrMatrix;
1189   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1190   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1191   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1192   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
1193   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1194   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1195 
1196   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1197 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1198   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
1199                                 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1200                                 upTriFactor->csrMat->values->data().get(),
1201                                 upTriFactor->csrMat->row_offsets->data().get(),
1202                                 upTriFactor->csrMat->column_indices->data().get(),
1203                                 upTriFactorT->csrMat->values->data().get(),
1204                                 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1205                                 CUSPARSE_ACTION_NUMERIC,indexBase,
1206                                 CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1207   cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1208 #endif
1209 
1210   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1211   stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
1212                           upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1213                           upTriFactor->csrMat->values->data().get(),
1214                           upTriFactor->csrMat->row_offsets->data().get(),
1215                           upTriFactor->csrMat->column_indices->data().get(),
1216                           upTriFactorT->csrMat->values->data().get(),
1217                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1218                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1219                           CUSPARSE_ACTION_NUMERIC, indexBase,
1220                           CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat);
1221                         #else
1222                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1223                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1224                         #endif
1225 
1226   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1227   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1228 
1229   /* Create the solve analysis information */
1230   ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1231   stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1232   #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1233   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
1234                                  upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1235                                  upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1236                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
1237                                  &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1238   cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1239   #endif
1240 
1241   /* perform the solve analysis */
1242   stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
1243                            upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1244                            upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1245                            upTriFactorT->csrMat->column_indices->data().get(),
1246                           #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1247                            upTriFactorT->solveInfo,
1248                            upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
1249                           #else
1250                            upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1251                           #endif
1252 
1253   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1254   ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1255 
1256   /* assign the pointer */
1257   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1258   PetscFunctionReturn(0);
1259 }
1260 
/* Unary functor for thrust::transform: maps a PetscScalar to a PetscInt by
   C-casting its real part (any imaginary part is discarded). Used below to
   recover integer permutation indices that were stashed in a scalar array. */
struct PetscScalarToPetscInt
{
  __host__ __device__ PetscInt operator()(PetscScalar value)
  {
    const PetscReal realPart = PetscRealPart(value);
    return (PetscInt)realPart;
  }
};
1269 
/* Form (or refresh) an explicit transpose of A on the GPU, stored in
   cusparsestruct->matTranspose, and mark A->transupdated.  For the CSR format the
   transpose structure (row offsets / column indices) is built once and subsequent
   calls only refresh the values via a cached permutation (csr2csc_i); for the
   legacy ELL/HYB formats (pre CUDA-11 only) a full HYB->CSR->CSC->HYB round trip
   is performed every time the transpose is rebuilt. */
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* make sure the (non-transposed) matrix data is current on the GPU first */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(0); /* transpose already up to date; nothing to do */
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
    /* non-CSR formats cannot be value-updated in place; throw the old transpose away */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
  }
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

    /* set alpha and beta: device-resident scalars (1, 0, 1) used later as SpMV coefficients */
    err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* allocate the transpose with swapped dimensions; same number of nonzeros */
      CsrMatrix *matrixT = new CsrMatrix;
      matstructT->mat = matrixT;
      matrixT->num_rows = A->cmap->n;
      matrixT->num_cols = A->rmap->n;
      matrixT->num_entries = a->nz;
      matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values = new THRUSTARRAY(a->nz);

      /* upload A's host row offsets (a->i) so csr2csc below can read them on the device */
      if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      #if PETSC_PKG_CUDA_VERSION_GE(11,2,1)
        stat = cusparseCreateCsr(&matstructT->matDescr,
                               matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                               matrixT->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
      #else
        /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
           see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

           I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
           it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
           when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
        */
        if (matrixT->num_entries) {
          stat = cusparseCreateCsr(&matstructT->matDescr,
                                 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                                 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                                 matrixT->values->data().get(),
                                 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I,
                                 indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);

        } else {
          matstructT->matDescr = NULL;
          matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
        }
      #endif
     #endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
   #else
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows = A->rmap->n;
      temp->num_cols = A->cmap->n;
      temp->num_entries = a->nz;
      temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle,
                              matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get());CHKERRCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      /* NOTE(review): tempT uses the SAME (rmap x cmap) dimensions as temp rather than
         swapped ones; for a rectangular A the transpose's row_offsets should have
         cmap->n+1 entries -- this looks correct only for square matrices. TODO confirm. */
      tempT->num_rows = A->rmap->n;
      tempT->num_cols = A->cmap->n;
      tempT->num_entries = a->nz;
      tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                              temp->num_cols, temp->num_entries,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get(),
                              tempT->values->data().get(),
                              tempT->column_indices->data().get(),
                              tempT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

      /* Last, convert CSC to HYB */
      cusparseHybMat_t hybMat;
      stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
      cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
        CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                              matstructT->descr, tempT->values->data().get(),
                              tempT->row_offsets->data().get(),
                              tempT->column_indices->data().get(),
                              hybMat, 0, partition);CHKERRCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries: the HYB matrix owns its own storage now */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY*) tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
        delete (CsrMatrix*) tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY*) temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
        delete (CsrMatrix*) temp;
      }
     #endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
    if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix");
    if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows");
    if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols");
    if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values");
    if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT");
    if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows");
    if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols");
    if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
      ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    if (!cusparsestruct->csr2csc_i) {
      /* Build csr2csc_i, the permutation that maps CSR value order to CSC (transpose) value
         order, by running csr2csc on the value sequence 0,1,2,...,nnz-1 (stored as scalars)
         and casting the permuted result back to integers.  Once cached, value-only updates
         of the transpose reduce to a single gather (the thrust::copy below). */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      void   *csr2cscBuffer;
      size_t csr2cscBufferSize;
      /* CUDA >= 11 requires an explicit user-allocated workspace for csr2csc */
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                           A->cmap->n, matrix->num_entries,
                                           matrix->values->data().get(),
                                           cusparsestruct->rowoffsets_gpu->data().get(),
                                           matrix->column_indices->data().get(),
                                           matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                           CUSPARSE_ACTION_NUMERIC,indexBase,
                                           cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat);
      err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err);
     #endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                              A->cmap->n,matrix->num_entries,
                              csr2csc_a.data().get(),
                              cusparsestruct->rowoffsets_gpu->data().get(),
                              matrix->column_indices->data().get(),
                              matrixT->values->data().get(),
                             #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                              matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                              CUSPARSE_ACTION_NUMERIC,indexBase,
                              cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat);
                             #else
                              matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
                             #endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
      }

      /* recover the integer permutation from the permuted scalar sequence */
      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      err = cudaFree(csr2cscBuffer);CHKERRCUDA(err);
     #endif
    }
    /* gather A's values through the cached permutation into the transpose's value array */
    PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
                                                     thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
                                                     matrixT->values->begin()));
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
1499 
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/* Solve A^T x = b using the cached transposed triangular factors, applying the
   factorization's row permutation to b on entry and its column permutation to x
   on exit.  Since A = LU implies A^T = U^T L^T, the transposed upper factor is
   solved first, then the transposed lower factor. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly (first transpose solve only) */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: x <- b(rperm) */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U: tempGPU <- U^T \ x */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Then, solve L: x <- L^T \ tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                         xarray);CHKERRCUSPARSE(stat);
                      #endif

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* log flops for the two triangular solves */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1588 
/* Solve A^T x = b with the cached transposed triangular factors when the
   factorization used the natural ordering, so no row/column permutations of the
   vectors are needed (compare MatSolveTranspose_SeqAIJCUSPARSE above).
   A^T = U^T L^T, hence the transposed upper factor is solved first. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly (first transpose solve only) */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U: tempGPU <- U^T \ b */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Then, solve L: x <- L^T \ tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        xarray);CHKERRCUSPARSE(stat);
                      #endif

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* log flops for the two triangular solves */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1658 
/* Solve A x = b using the cached triangular factors (A = LU): permute b by the
   factorization's row permutation, solve L then U, and permute the result by the
   column permutation into x. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: tempGPU <- b(rperm) */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L: x <- L \ tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                         xarray,
                         loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                         xarray);CHKERRCUSPARSE(stat);
                      #endif

  /* Then, solve U: tempGPU <- U \ x */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,xarray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Last, reorder with the column permutation: x <- tempGPU(cperm) */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* log flops for the two triangular solves */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1734 
/*
  Solve A x = b using the cuSPARSE triangular factors of an (I)LU factorization
  computed with natural ordering.  No row/column permutations are involved, so
  the lower solve reads b directly and the upper solve writes x directly; the
  factors' shared work vector holds the intermediate result of the lower solve.

  Input:  A  - the factored SeqAIJCUSPARSE matrix (holds L and U on the GPU)
          bb - right-hand side vector
  Output: xx - solution vector
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L: tempGPU = L^{-1} b.  The argument list of cusparse_solve
     differs before CUDA 9 (no nnz count, policy, or external buffer). */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        loTriFactor->solvePolicy,loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Next, solve U: x = U^{-1} tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        xarray);CHKERRCUSPARSE(stat);
                      #endif

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves cost 2*nnz - n flops */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1796 
/*
  Copy the matrix values from the GPU CSR storage back into the host array of
  the Mat_SeqAIJ.  Only the a->nz values are transferred; afterwards host and
  device are marked as both up to date.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  cudaError_t        cerr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  /* nothing to do unless the GPU holds the only current copy */
  if (A->offloadmask != PETSC_OFFLOAD_GPU) PetscFunctionReturn(0);
  csr  = (CsrMatrix*)cusp->mat->mat;
  ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
  cerr = cudaMemcpy(a->a,csr->values->data().get(),a->nz*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_BOTH;
  PetscFunctionReturn(0);
}
1817 
/*
  MatSeqAIJGetArray() implementation: make sure the host copy of the values is
  current (copying from the GPU if necessary) and hand out the host pointer.
*/
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr   = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  *array = aij->a;
  PetscFunctionReturn(0);
}
1827 
/*
  MatSeqAIJRestoreArray() implementation: the caller may have modified the host
  values, so mark the device copy stale and clear the borrowed pointer.
*/
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  *array         = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
1835 
/*
  MatSeqAIJGetArrayRead() implementation: refresh the host values from the GPU
  if needed and return the host pointer for read-only access.
*/
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr   = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  *array = aij->a;
  PetscFunctionReturn(0);
}
1845 
/*
  MatSeqAIJRestoreArrayRead() implementation: read-only access cannot have
  modified the host values, so the offload mask is left untouched; only the
  borrowed pointer is cleared.
*/
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(0);
}
1852 
/*
  MatSeqAIJGetArrayWrite() implementation: hand out the host values array
  without copying from the GPU, since write-only access means the current
  contents will be overwritten by the caller.
*/
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  *array = aij->a;
  PetscFunctionReturn(0);
}
1859 
/*
  MatSeqAIJRestoreArrayWrite() implementation: the host values were rewritten,
  so invalidate the device copy and clear the borrowed pointer.
*/
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  *array         = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
1867 
/*
  Copy the CSR data of a SeqAIJ matrix from the host to the GPU, (re)building
  the cuSPARSE representation when needed.  If the nonzero pattern is unchanged
  and the storage format is CSR, only the numerical values are uploaded;
  otherwise the device structures are destroyed and rebuilt from the host data
  (CSR, or ELL/HYB before CUDA 11).  On success the offload mask is set to
  PETSC_OFFLOAD_BOTH unless the host had no values array, in which case only
  the structure was uploaded and the mask is left as-is.
*/
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE;   /* set to FALSE when only the structure (no values) is uploaded */
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
  /* upload only when the host holds the newest copy of the data */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      matrix->values->assign(a->a, a->a+a->nz);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* values changed, so any cached transpose values are stale (pattern kept) */
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
    } else {
      PetscInt nnz;
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* tear down the old device representation before rebuilding */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        /* compressed row storage only uploads the rows with nonzeros */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
        if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");
        /* no host values: upload structure only and do not mark values as synced */
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
        else nnz = a->nz;

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident scalar constants, needed because the cusparse handle
           below is put in CUSPARSE_POINTER_MODE_DEVICE */
        err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                    mat->num_rows, mat->num_cols, mat->num_entries,
                                    mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                    mat->values->data().get(),
                                    CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                    CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
          }
         #endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
         #else
          /* build a temporary CSR matrix on the device, convert it to HYB/ELL,
             then discard the CSR copy */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
              matstruct->descr, mat->values->data().get(),
              mat->row_offsets->data().get(),
              mat->column_indices->data().get(),
              hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
         #endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        /* NOTE(review): the scalar count uses a->nz even when a->a is NULL
           (structure-only upload); the logged byte count may overestimate then */
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
2024 
/* Thrust functor for zipped iterators: accumulate the first tuple element
   into the second one (dst += src). */
struct VecCUDAPlusEquals
{
  template <typename T>
  __host__ __device__
  void operator()(T zipped)
  {
    thrust::get<1>(zipped) += thrust::get<0>(zipped);
  }
};
2034 
/* Thrust functor for zipped iterators: copy the first tuple element into the
   second one (dst = src). */
struct VecCUDAEquals
{
  template <typename T>
  __host__ __device__
  void operator()(T zipped)
  {
    thrust::get<1>(zipped) = thrust::get<0>(zipped);
  }
};
2044 
/* Thrust functor for zipped iterators: copy the second tuple element into the
   first one (the reverse direction of VecCUDAEquals). */
struct VecCUDAEqualsReverse
{
  template <typename T>
  __host__ __device__
  void operator()(T zipped)
  {
    thrust::get<0>(zipped) = thrust::get<1>(zipped);
  }
};
2054 
/* Scratch data attached to C->product for cuSPARSE-based matrix-matrix products */
struct MatMatCusparse {
  PetscBool             cisdense;  /* the user's C was MATSEQDENSE (host); convert back after computing on the GPU */
  PetscScalar           *Bt;       /* device buffer holding B^T (pre CUDA-11 csrmm does not support op(B)) */
  Mat                   X;         /* intermediate dense matrix for PtAP/RARt */
  PetscBool             reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops;
  CsrMatrix             *Bcsr;

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr;
  PetscBool             initialized;   /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr;     /* dense descriptor of B */
  cusparseDnMatDescr_t  matCDescr;     /* dense descriptor of C (or of X for PtAP/RARt) */
  PetscInt              Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  void                  *dBuffer4;
  void                  *dBuffer5;
 #endif
  size_t                mmBufferSize;  /* size of the SpMM/SpGEMM work buffer below */
  void                  *mmBuffer;
  void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};
2079 
/*
  Destroy callback for the MatMatCusparse product data: frees the device work
  buffers, the cuSPARSE descriptors and the intermediate dense matrix X.
  cudaFree(NULL) is a documented no-op, so buffers that were never allocated
  are safe to pass unconditionally.
*/
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  PetscErrorCode   ierr;
  MatMatCusparse   *mmdata = (MatMatCusparse *)data;
  cudaError_t      cerr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t stat;
 #endif

  PetscFunctionBegin;
  cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr);
  delete mmdata->Bcsr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->matBDescr)   { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->matCDescr)   { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->spgemmDesc)  { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); }
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  if (mmdata->dBuffer4)  { cerr = cudaFree(mmdata->dBuffer4);CHKERRCUDA(cerr); }
  if (mmdata->dBuffer5)  { cerr = cudaFree(mmdata->dBuffer5);CHKERRCUDA(cerr); }
 #endif
  if (mmdata->mmBuffer)  { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); }
  if (mmdata->mmBuffer2) { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); }
 #endif
  ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2108 
2109 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
2110 
/*
  Numeric phase of C = op(A) op(B) where A is SeqAIJCUSPARSE and the result is
  dense: performs the sparse-times-dense product with cusparseSpMM (CUDA >= 11,
  which supports op(B) natively) or cusparse_csr_spmm (older toolkits, which
  require an explicitly transposed copy of B held in mmdata->Bt).  For PtAP and
  RARt the sparse product is written to the intermediate dense matrix mmdata->X
  and then combined with B through a dense matrix-matrix multiply.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr   = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp   = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* select op(A) and the dimensions m x n of the sparse product op(A) op(B) */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    /* either multiply by A transposed implicitly, or build/reuse an explicit
       transpose and multiply by it untransposed */
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);

  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* PtAP/RARt write the sparse product into the intermediate X; others into C */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
    /* grow the work buffer if needed; it is never shrunk so it can be reused */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      cudaError_t cerr;
      cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
      cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
 #else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* explicitly transpose B into the preallocated Bt buffer with a GEAM */
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k    = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
 #endif
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    /* combine the intermediate X with B through a dense product to form C */
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    /* combine the intermediate X with B through a dense product to form C */
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  /* hand back host-resident dense matrices if that is what the caller supplied */
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2283 
/*
  Symbolic phase of a product C = op(A) op(B) where A is SeqAIJCUSPARSE and the
  result is dense: determine the dimensions of C from the product type, set C's
  type to MATSEQDENSECUDA (remembering whether the caller asked for a host dense
  matrix so the numeric phase can convert back), allocate the MatMatCusparse
  scratch data, and install the numeric phase callback.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  /* dimensions of the (dense) result, per product type */
  if      (product->type == MATPRODUCT_AB)   { m = A->rmap->n; n = B->cmap->n; }
  else if (product->type == MATPRODUCT_AtB)  { m = A->cmap->n; n = B->cmap->n; }
  else if (product->type == MATPRODUCT_ABt)  { m = A->rmap->n; n = B->rmap->n; }
  else if (product->type == MATPRODUCT_PtAP) { m = B->cmap->n; n = B->cmap->n; }
  else if (product->type == MATPRODUCT_RARt) { m = B->rmap->n; n = B->rmap->n; }
  else SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense;
 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
 #endif
  /* PtAP and RARt compute an intermediate dense matrix X first */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
    if (product->type == MATPRODUCT_RARt) {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2357 
/*
   Numeric phase of the sparse-sparse product C = op(A)*op(B) with A,B,C all of
   type MATSEQAIJCUSPARSE. The symbolic phase
   (MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE) must have run first: it
   allocated C, stored a MatMatCusparse context in C->product->data (SpGEMM
   descriptor and work buffers, and possibly an uncompressed CSR view of B),
   and set this routine as C->ops->productnumeric.

   Supported product types here are AB, AtB and ABt; the transposes needed for
   AtB/ABt were formed explicitly during the symbolic phase, so cuSPARSE is
   always called with non-transpose operations. Only MAT_CUSPARSE_CSR storage
   is supported.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t         BmatSpDescr;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;
  A = product->A;
  B = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    /* values are already on the GPU; just sanity-check C's structures and jump to the assembly bookkeeping */
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix*)Cmat->mat;
    if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
    goto finalize;
  }
  /* empty product: nothing to compute, only the assembly bookkeeping below */
  if (!c->nz) goto finalize;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  /* make sure the latest host values of A and B are on the device */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);

  /* the symbolic phase may have rewritten AtB/ABt as AB using symmetry; replay that decision
     and verify both phases agree, otherwise the cached SpGEMM data would be inconsistent */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    if (!product->symbolic_used_the_fact_A_is_symmetric) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    if (!product->symbolic_used_the_fact_B_is_symmetric) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric");
  }
  /* select the mult structs; the explicit transposes were formed during the symbolic phase */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix*)Cmat->mat;
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
    /* reuse path: sparsity analysis was done in symbolic, only the values are recomputed */
    stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                               Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                               cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                               mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
  #else
    /* generic SpGEMM: compute into the descriptor then copy into C's CSR arrays,
       using the buffers sized during the symbolic phase */
    stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                               Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                               cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                               mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
    stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                               Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                               cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
  #endif
#else
  /* legacy (pre CUDA-11) csrgemm interface, recomputes the whole product */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
  ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax);CHKERRQ(ierr);
  c->reallocs         = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}
2487 
/*
   Symbolic phase of the sparse-sparse product C = op(A)*op(B) with A,B of type
   MATSEQAIJCUSPARSE. It determines the sparsity pattern of C on the GPU,
   allocates C's CSR storage (device and mirrored host i/j arrays), and caches
   everything the numeric phase needs in a MatMatCusparse context attached to
   C->product->data.

   Three cuSPARSE code paths are used depending on the CUDA version:
     - CUDA >= 11.4: cusparseSpGEMMreuse_* (structure analysis done once here,
       numeric phase only recomputes values);
     - CUDA >= 11.0: cusparseSpGEMM_* (work estimation + compute + copy);
     - older:        cusparseXcsrgemmNnz + csrgemm (symbolic-only is not
       possible, so the values are computed here as well).

   Supported product types: AB, AtB, ABt; AtB/ABt are handled by forming the
   explicit transpose, since cuSPARSE spgemm does not support transposed
   operands. Only MAT_CUSPARSE_CSR storage is supported.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *a,*b,*c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     i,j,m,n,k;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble               flops;
  PetscBool                    biscompressed,ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
  cusparseSpMatDescr_t         BmatSpDescr;
#else
  int                          cnz;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ*)A->data;
  b = (Mat_SeqAIJ*)B->data;
  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");

  /* if an operand is symmetric, rewrite AtB/ABt as the cheaper AB and record
     that decision so the numeric phase can check consistency */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
  }
  if (B->symmetric && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
  }
  /* determine C's dimensions, pick the mult structs (forming explicit transposes
     where needed), and detect compressed-row storage of the operands:
     C inherits the compressed-row layout of the op(A) factor, while a compressed
     B needs an uncompressed row-offset view (built further below) */
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  switch (ptype) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    k = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr);
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  ierr  = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  ierr  = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  c     = (Mat_SeqAIJ*)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
  Ccsr  = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, then c will be in compressed row format */
    /* C reuses the nonzero row indices of op(A); copy them to both host and device */
    c->compressedrow.nrows = a->compressedrow.nrows;
    ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
    ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
  Ccusp->mat      = Cmat;
  Ccusp->mat->mat = Ccsr;
  Ccsr->num_rows    = Ccusp->nrows;
  Ccsr->num_cols    = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
  stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
  /* device-resident scalar constants used with CUSPARSE_POINTER_MODE_DEVICE */
  cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
    /* build an all-zero CSR for C and skip straight to the host-side assembly */
    thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
    c->nz = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix*)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix */
    /* wrap B's compressed CSR in a view with full (uncompressed) row offsets;
       column indices and values are shared, only the row-offset array is new */
    CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
    Bcsr = new CsrMatrix;
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices;
    Bcsr->values         = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
      ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr = Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
                               Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                               Bcsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  /* precompute flops count */
  if (ptype == MATPRODUCT_AB) {
    /* each nonzero a_ij contributes one multiply-add per nonzero in row j of B */
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i+1];
      for (j=st; j<en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2.*(b->i[brow+1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt anzi = a->i[i+1] - a->i[i];
      const PetscInt bnzi = b->i[i+1] - b->i[i];
      flops += (2.*anzi)*bnzi;
    }
  } else { /* TODO */
    flops = 0.;
  }

  mmdata->flops = flops;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  /* create C's generic descriptor with nnz=0/NULL arrays; the real pointers are
     set via cusparseCsrSetPointers once the pattern size is known */
  stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
                          NULL, NULL, NULL,
                          CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                          CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
 {
  /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
     We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
  */
  void*  dBuffer1 = NULL;
  void*  dBuffer2 = NULL;
  void*  dBuffer3 = NULL;
  /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
  size_t bufferSize1 = 0;
  size_t bufferSize2 = 0;
  size_t bufferSize3 = 0;
  size_t bufferSize4 = 0;
  size_t bufferSize5 = 0;

  /*----------------------------------------------------------------------*/
  /* ask bufferSize1 bytes for external memory */
  stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                            CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                            &bufferSize1, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &dBuffer1, bufferSize1);CHKERRCUDA(cerr);
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                            CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                            &bufferSize1, dBuffer1);CHKERRCUSPARSE(stat);

  /*----------------------------------------------------------------------*/
  /* nnz computation: each call is issued twice, first to query the buffer size, then to do the work */
  stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                 &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &dBuffer2, bufferSize2);CHKERRCUDA(cerr);
  cerr = cudaMalloc((void**) &dBuffer3, bufferSize3);CHKERRCUDA(cerr);
  cerr = cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4);CHKERRCUDA(cerr);
  stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                 &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);CHKERRCUSPARSE(stat);
  cerr = cudaFree(dBuffer1);CHKERRCUDA(cerr);
  cerr = cudaFree(dBuffer2);CHKERRCUDA(cerr);

  /*----------------------------------------------------------------------*/
  /* get matrix C non-zero entries C_nnz1 */
  stat  = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
  c->nz = (PetscInt) C_nnz1;
  /* allocate matrix C */
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values         = new THRUSTARRAY(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  /* update matC with the new pointers */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());CHKERRCUSPARSE(stat);

  /*----------------------------------------------------------------------*/
  stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                  &bufferSize5, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5);CHKERRCUDA(cerr);
  stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                  &bufferSize5, mmdata->dBuffer5);CHKERRCUSPARSE(stat);
  cerr = cudaFree(dBuffer3);CHKERRCUDA(cerr);
  /* compute the values once here as well, so a reusesym numeric phase can skip it */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                                     Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                     cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                     mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
  ierr = PetscInfo9(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024);CHKERRQ(ierr);
 }
 #else
  size_t bufSize2;
  /* ask bufferSize bytes for external memory */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
  /* ask bufferSize again bytes for external memory */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
  /* The CUSPARSE documentation is not clear, nor the API
     We need both buffers to perform the operations properly!
     mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
     it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
     is stored in the descriptor! What a messy API... */
  cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
  /* compute the intermediate product of A * B */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  /* get matrix C non-zero entries C_nnz1 */
  stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
  c->nz = (PetscInt) C_nnz1;
  ierr = PetscInfo9(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr);
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
 #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
#else
  /* legacy (pre CUDA-11) path: nnz query with host pointer mode, then full numeric product */
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
  stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
  c->nz = cnz;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
     I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
     D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalizesym:
  /* mirror the device CSR pattern into the host Mat_SeqAIJ (i,j,ilen,imax,...) */
  c->singlemalloc = PETSC_FALSE;
  c->free_a       = PETSC_TRUE;
  c->free_ij      = PETSC_TRUE;
  ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
  if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
    PetscInt *d_i = c->i;
    THRUSTINTARRAY ii(Ccsr->row_offsets->size());
    THRUSTINTARRAY jj(Ccsr->column_indices->size());
    ii   = *Ccsr->row_offsets;
    jj   = *Ccsr->column_indices;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  } else {
    PetscInt *d_i = c->i;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  if (ciscompressed) { /* need to expand host row offsets */
    /* rows not present in the compressed storage are empty: they repeat the previous offset */
    PetscInt r = 0;
    c->i[0] = 0;
    for (k = 0; k < c->compressedrow.nrows; k++) {
      const PetscInt next = c->compressedrow.rindex[k];
      const PetscInt old = c->compressedrow.i[k];
      for (; r < next; r++) c->i[r+1] = old;
    }
    for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
  }
  ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
  c->maxnz = c->nz;
  c->nonzerorowcnt = 0;
  c->rmax = 0;
  /* derive per-row lengths and row statistics from the expanded row offsets */
  for (k = 0; k < m; k++) {
    const PetscInt nn = c->i[k+1] - c->i[k];
    c->ilen[k] = c->imax[k] = nn;
    c->nonzerorowcnt += (PetscInt)!!nn;
    c->rmax = PetscMax(c->rmax,nn);
  }
  ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
  Ccsr->num_entries = c->nz;

  C->nonzerostate++;
  ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
  Ccusp->nonzerostate = C->nonzerostate;
  C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
  C->preallocated  = PETSC_TRUE;
  C->assembled     = PETSC_FALSE;
  C->was_assembled = PETSC_FALSE;
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
2886 
2887 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2888 
/* handles sparse or dense B.
   Selects which symbolic-phase routine will compute mat = product(A,B[,C]):
   - dense B dispatches to the AIJCUSPARSE x DENSECUDA kernels (unless A is bound to the CPU),
   - cusparse-type B (and C, for MATPRODUCT_ABC) dispatches to the AIJCUSPARSE x AIJCUSPARSE kernels,
   - anything else, or an explicit user request through the -mat*_backend_cpu options,
     falls back to the plain SeqAIJ (CPU) implementation. */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
  PetscErrorCode ierr;
  PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr);
  /* only consider the GPU backend when neither operand has been bound to the CPU */
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr);
  }
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr);
    }
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool usecpu = PETSC_FALSE;
    /* The cases below differ only in the option database name: product->api_user
       distinguishes options spelled after the old-style MatMatMult()-like calls
       from the ones spelled after the MatProduct API */
    switch (product->type) {
    case MATPRODUCT_AB:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_ab_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_AtB:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_atb_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_PtAP:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_ptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_RARt:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_rart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_ABC:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_abc_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    default:
      break;
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
     if (product->A->boundtocpu) {
        ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      /* no fused GPU kernel for these: use the generic chained implementation */
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3011 
/* yy = A*xx, computed on the GPU by the shared SpMV kernel
   (no transpose, no Hermitian conjugation, no vector added in) */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  err = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(err);
  PetscFunctionReturn(0);
}
3020 
/* zz = A*xx + yy, computed on the GPU by the shared SpMV kernel */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  err = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(err);
  PetscFunctionReturn(0);
}
3029 
/* yy = A^H*xx, computed on the GPU by the shared SpMV kernel (transpose + conjugation) */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  err = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(err);
  PetscFunctionReturn(0);
}
3038 
/* zz = A^H*xx + yy, computed on the GPU by the shared SpMV kernel */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  err = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(err);
  PetscFunctionReturn(0);
}
3047 
/* yy = A^T*xx, computed on the GPU by the shared SpMV kernel (transpose, no conjugation) */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  err = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(err);
  PetscFunctionReturn(0);
}
3056 
/* y[idx[i]] += x[i] for 0 <= i < n, launched with a 1-D grid.
   A grid-stride loop (with the index accumulated in PetscInt) makes the kernel
   correct for any launch configuration and avoids 32-bit overflow of
   blockIdx.x*blockDim.x when PetscInt is 64-bit and n is large.
   NOTE(review): the unsynchronized += assumes the entries of idx are distinct
   (true for the compressed-row indices this kernel is used with in this file);
   duplicate indices would race. */
__global__ static void ScatterAdd(PetscInt n, const PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  const PetscInt stride = (PetscInt)blockDim.x*gridDim.x;
  for (PetscInt i = (PetscInt)blockIdx.x*blockDim.x + threadIdx.x; i < n; i += stride) y[idx[i]] += x[i];
}
3062 
/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op

   Common kernel behind all the MatMult*_SeqAIJCUSPARSE wrappers:
   - yy may be NULL (plain multiply) and may alias zz (in-place add);
   - when the matrix stores compressed rows (zero rows dropped), the struct's
     work vector holds the short product (non-transpose case) or the gathered x
     (transpose case), and the result is scattered into the full-length zz afterwards. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny;
#endif

  PetscFunctionBegin;
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
  if (!a->nonzerorowcnt) {
    /* empty matrix: the product is zero, so only y (if any) survives */
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    /* Hermitian products, or matrices not configured for an explicit transpose,
       use the non-transposed storage with a transposed cusparse operation;
       otherwise an explicit transpose is built (once) and multiplied directly */
    if (herm || !A->form_explicit_transpose) {
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
     #endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
     #endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* opA indexes the per-operation cached descriptor/buffer array, so its
         numeric value must match the cusparse enum layout this code was built against */
      if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        cudaError_t cerr;
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                matstruct->matDescr,
                                matstruct->cuSpMV[opA].vecXDescr, beta,
                                matstruct->cuSpMV[opA].vecYDescr,
                                cusparse_scalartype,
                                cusparsestruct->spmvAlg,
                                &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
      }

      stat = cusparseSpMV(cusparsestruct->handle, opA,
                               matstruct->alpha_one,
                               matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
                               matstruct->cuSpMV[opA].vecXDescr,
                               beta,
                               matstruct->cuSpMV[opA].vecYDescr,
                               cusparse_scalartype,
                               cusparsestruct->spmvAlg,
                               matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
     #else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha_one, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
     #endif
    } else {
      if (cusparsestruct->nrows) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha_one, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
       #endif
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
         */
       #if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
       #else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
       #endif
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    /* NOTE(review): only char* exceptions are caught here; thrust typically throws
       class-type exceptions — confirm this matches what the called code can throw */
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3254 
/* zz = A^T*xx + yy, computed on the GPU by the shared SpMV kernel */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  err = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(err);
  PetscFunctionReturn(0);
}
3263 
/* Finish assembly through the host SeqAIJ path; if assembly changed the
   nonzero pattern, the cached device-side matrix is stale and is freed here */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode     err;
  PetscObjectState   statebefore = A->nonzerostate;
  Mat_SeqAIJCUSPARSE *cusparse = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  err = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(err);
  if (statebefore != A->nonzerostate && cusparse->deviceMat) {
    cudaError_t cuerr;

    err = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(err);
    cuerr = cudaFree(cusparse->deviceMat);CHKERRCUDA(cuerr);
    cusparse->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}
3281 
3282 /* --------------------------------------------------------------------------------*/
3283 /*@
3284    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
3286    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
3287    assembly performance the user should preallocate the matrix storage by setting
3288    the parameter nz (or the array nnz).  By setting these parameters accurately,
3289    performance during matrix assembly can be increased by more than a factor of 50.
3290 
3291    Collective
3292 
3293    Input Parameters:
3294 +  comm - MPI communicator, set to PETSC_COMM_SELF
3295 .  m - number of rows
3296 .  n - number of columns
3297 .  nz - number of nonzeros per row (same for all rows)
3298 -  nnz - array containing the number of nonzeros in the various rows
3299          (possibly different for each row) or NULL
3300 
3301    Output Parameter:
3302 .  A - the matrix
3303 
3304    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
3306    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
3307 
3308    Notes:
3309    If nnz is given then nz is ignored
3310 
3311    The AIJ format (also called the Yale sparse matrix format or
3312    compressed row storage), is fully compatible with standard Fortran 77
3313    storage.  That is, the stored row and column indices can begin at
3314    either one (as in Fortran) or zero.  See the users' manual for details.
3315 
3316    Specify the preallocated storage with either nz or nnz (not both).
3317    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
3318    allocation.  For large problems you MUST preallocate memory or you
3319    will get TERRIBLE performance, see the users' manual chapter on matrices.
3320 
3321    By default, this format uses inodes (identical nodes) when possible, to
3322    improve numerical efficiency of matrix-vector products and solves. We
3323    search for consecutive rows with the same nonzero structure, thereby
3324    reusing matrix information to achieve increased efficiency.
3325 
3326    Level: intermediate
3327 
3328 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
3329 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  /* create an empty matrix, size it (sequential: local == global sizes), give it
     the CUSPARSE type, then preallocate from either nz or the nnz[] array */
  err = MatCreate(comm,A);CHKERRQ(err);
  err = MatSetSizes(*A,m,n,m,n);CHKERRQ(err);
  err = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(err);
  err = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(err);
  PetscFunctionReturn(0);
}
3341 
/* Free the GPU-side data, detach the composed methods, then run the host SeqAIJ destructor */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  /* spptr holds plain GPU storage for unfactored matrices and triangular-factor
     data for factored ones; each has its own destroy routine */
  if (A->factortype == MAT_FACTOR_NONE) {
    err = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(err);
  } else {
    err = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(err);
  }
  /* remove the composed methods installed for this type */
  err = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(err);
  err = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(err);
  err = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetUseCPUSolve_C",NULL);CHKERRQ(err);
  err = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(err);
  err = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(err);
  err = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(err);
  err = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(err);
  err = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(err);
  err = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(err);
  err = PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaijcusparse_hypre_C",NULL);CHKERRQ(err);
  err = MatDestroy_SeqAIJ(A);CHKERRQ(err);
  PetscFunctionReturn(0);
}
3365 
3366 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
3367 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicate via the host SeqAIJ path, then convert the copy in place so it
   acquires the CUSPARSE type and its GPU machinery */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  err = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(err);
  err = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(err);
  PetscFunctionReturn(0);
}
3377 
/* Y = Y + a*X, computed on the GPU when both matrices live there.
   Three paths:
   - SAME_NONZERO_PATTERN: one cublas axpy over the two value arrays;
   - SUBSET_NONZERO_PATTERN: cusparse csrgeam, writing into Y's existing pattern;
   - otherwise: fall back to the host MatAXPY_SeqAIJ (after invalidating the
     cached transpose). A pattern-equality check below may upgrade str to
     SAME_NONZERO_PATTERN to take the fast path. */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry,*csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    /* mixed implementations (e.g. one operand bound to the CPU): do it on the host */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
  if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;
  /* see if we can turn this into a cublas axpy */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
    if (eq) {
      eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
    }
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    cusparseStatus_t stat;
    PetscScalar      b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* CUDA 11 csrgeam2 requires an explicitly sized scratch buffer */
    size_t           bufferSize;
    void             *buffer;
    cudaError_t      cerr;
#endif

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    /* scalars a and b are passed by host pointer below */
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
                                          &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                          &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                             cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    cerr = cudaFree(buffer);CHKERRCUDA(cerr);
#else
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
#endif
    /* restore the pointer mode the rest of this file expects */
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else if (str == SAME_NONZERO_PATTERN) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t berr;
    PetscBLASInt   one = 1, bnz = 1;

    /* identical patterns: the value arrays line up, so a dense-vector axpy suffices */
    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr);
    ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else {
    /* different patterns: host fallback */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3475 
/* Y = a*Y: scales the stored nonzero values of Y in place on the GPU using cuBLAS */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)Y->data;
  PetscScalar    *values;
  cublasHandle_t handle;
  cublasStatus_t cberr;
  PetscBLASInt   inc = 1, n = 1;

  PetscFunctionBegin;
  /* the value array is treated as one contiguous vector of length nz */
  ierr = MatSeqAIJCUSPARSEGetArray(Y,&values);CHKERRQ(ierr);
  ierr = PetscCUBLASGetHandle(&handle);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(aij->nz,&n);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  cberr = cublasXscal(handle,n,&a,values,inc);CHKERRCUBLAS(cberr);
  ierr = PetscLogGpuFlops(n);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSERestoreArray(Y,&values);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3497 
/* MatZeroEntries for MATSEQAIJCUSPARSE: zero the nonzero values in place,
   both on the device (the cached CSR matrix and, if it has been built, its
   transpose) and on the host aij array.  The sparsity pattern is untouched. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;
  PetscBool      both = PETSC_FALSE; /* set when the device copy was zeroed too */
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) { /* factored matrices have no spptr of this type */
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
    if (spptr->mat) {
      CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
      if (matrix->values) {
        both = PETSC_TRUE;
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
    if (spptr->matTranspose) { /* keep the cached transpose consistent */
      CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
      if (matrix->values) {
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
  }
  //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);
  /* zero the host copy as well; a->i[A->rmap->n] is the total number of stored values */
  ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  /* both copies are valid (all zero) only when the device values were zeroed above */
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
3528 
/* Switch the matrix operations between the CPU (SeqAIJ) and GPU (CUSPARSE)
   implementations.

   flg = PETSC_TRUE  : copy the data back from the GPU and install the plain
                       SeqAIJ kernels and accessors;
   flg = PETSC_FALSE : install the CUSPARSE kernels and device accessors.

   For factored matrices only the boundtocpu flag is recorded. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    A->boundtocpu = flg;
    PetscFunctionReturn(0);
  }
  if (flg) {
    /* the host copy must be current before the CPU kernels can run */
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* zeroing the aij-level ops table restores the default (host) array accessors */
    ierr = PetscMemzero(a->ops,sizeof(Mat_SeqAIJOps));CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    /* device-aware array accessors so MatSeqAIJGetArray() and friends see GPU data */
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  /* inodes are only exploited by the CPU kernels */
  if (flg && a->inode.size) {
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3592 
/* Convert a MATSEQAIJ matrix to MATSEQAIJCUSPARSE (also reached from
   MatCreate_SeqAIJCUSPARSE() with MAT_INPLACE_MATRIX).

   Allocates the GPU-side container — Mat_SeqAIJCUSPARSE for a regular matrix or
   Mat_SeqAIJCUSPARSETriFactors for a factored one — creates the cusparse handle on
   PETSc's default CUDA stream, installs the CUSPARSE ops table via
   MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE), and composes the type-specific
   functions. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  B = *newmat;

  /* GPU matrices pair with CUDA vectors by default */
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;
      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      spptr->format     = MAT_CUSPARSE_CSR;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
     #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
      spptr->spmvAlg    = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
     #else
      spptr->spmvAlg    = CUSPARSE_CSRMV_ALG1;    /* default, since we only support csr */
     #endif
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
     #endif
      B->spptr = spptr;
    } else {
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; /* nothing is on the GPU yet */
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr); /* install the GPU kernels */
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
#if defined(PETSC_HAVE_HYPRE)
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr);
#endif
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetUseCPUSolve_C",MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3654 
/* Registered constructor for MATSEQAIJCUSPARSE: build a plain SeqAIJ matrix and
   convert it in place to the CUSPARSE type. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3664 
3665 /*MC
3666    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3667 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
3669    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
3670    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3671 
3672    Options Database Keys:
3673 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
3674 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_use_cpu_solve - Do MatSolve on CPU
3677 
3678   Level: beginner
3679 
3680 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3681 M*/
3682 
3683 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);
3684 
/* Register the CUSPARSE solver packages with the MatSolverType registry:
   MATSOLVERCUSPARSEBAND provides LU for MATSEQAIJ, and MATSOLVERCUSPARSE provides
   LU, Cholesky, ILU and ICC for MATSEQAIJCUSPARSE. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode      ierr;
  const MatFactorType ftype[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  size_t              i;

  PetscFunctionBegin;
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND,MATSEQAIJ,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr);
  for (i=0; i<sizeof(ftype)/sizeof(ftype[0]); i++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftype[i],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3698 
/* Free the Mat_SeqAIJCUSPARSE container: the device matrix and its cached
   transpose, the scratch vector, the GPU row-offset array, the COO permutation
   arrays, the csr2csc index cache, and finally the cusparse handle itself.
   PetscFree() zeroes *cusparsestruct on success. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    Mat_SeqAIJCUSPARSE *cusp = *cusparsestruct;

    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat,cusp->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
    delete cusp->workVector;
    delete cusp->rowoffsets_gpu;
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    delete cusp->csr2csc_i;
    if (cusp->handle) {stat = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(stat);}
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3718 
/* Delete a CsrMatrix (values, column indices, row offsets live in thrust device
   vectors) and NULL the caller's pointer. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *m = *mat;

  PetscFunctionBegin;
  if (m) {
    delete m->values;
    delete m->column_indices;
    delete m->row_offsets;
    delete m;
    *mat = NULL;
  }
  PetscFunctionReturn(0);
}
3731 
/* Free one triangular-factor structure of a device factorization: the cusparse
   matrix descriptor, the triangular-solve analysis info, the CSR storage, and the
   device/host scratch buffers.  PetscFree() zeroes *trifactor on success. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    if ((*trifactor)->solveBuffer)   {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
    if ((*trifactor)->AA_h)   {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);} /* pinned host buffer */
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
   #endif
    ierr = PetscFree(*trifactor);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3751 
/* Free a Mat_SeqAIJCUSPARSEMultStruct (the device matrix used by MatMult and
   friends): the stored matrix in the given storage format, the cusparse matrix
   descriptor, the compressed-row index array, the device-resident scalar
   constants, and (CUDA >= 11) the SpMV descriptors/buffers.

   Fix: the CsrMatrix_Destroy() call previously ignored the returned
   PetscErrorCode; it is now checked with CHKERRQ like every other call to it in
   this file. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode   ierr;
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
       #endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr); /* error code was previously dropped */
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    /* device copies of 1 and 0 used as alpha/beta arguments with device pointer mode */
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    for (int i=0; i<3; i++) { /* one cached SpMV setup per operation variant */
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
3795 
/* Release everything held by a Mat_SeqAIJCUSPARSETriFactors except the cusparse
   handle, so the structure can be refilled by a new factorization: the four
   triangular factors (plain and transposed), the row/column permutation index
   arrays, the work vector, and the band-solver device arrays. */
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = NULL;
    (*trifactors)->cpermIndices = NULL;
    (*trifactors)->workVector = NULL;
    if ((*trifactors)->a_band_d)   {cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);}
    if ((*trifactors)->i_band_d)   {cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);}
    (*trifactors)->init_dev_prop = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3818 
/* Fully destroy a Mat_SeqAIJCUSPARSETriFactors: reset its contents, then destroy
   the cusparse handle and free the structure itself.

   Fix: the original used an assignment as the if condition
   (`if (handle = (*trifactors)->handle)`), a classic -Wparentheses hazard that
   reads as a typo for `==`; the assignment and the test are now separate. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    handle = (*trifactors)->handle;
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3835 
/* Strict-weak lexicographic ordering on (row,col) tuples: first by row, then by
   column.  Used to sort COO entries into CSR order. */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    const PetscInt r1 = t1.get<0>(), r2 = t2.get<0>();
    return (r1 != r2) ? (r1 < r2) : (t1.get<1>() < t2.get<1>());
  }
};
3846 
/* Equality on (row,col) tuples; used to collapse duplicate COO entries. */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>();
  }
};
3856 
/* 1 when the two values differ, 0 when they are equal; used with
   adjacent_difference to flag where a new index value starts. */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != t2) ? 1 : 0;
  }
};
3865 
/* Logical-or of two 0/1 flags, returned as PetscInt; combines the row-change and
   column-change indicators. */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 || t2) ? 1 : 0;
  }
};
3874 
3875 #include <thrust/iterator/discard_iterator.h>
/* Set the matrix values from a COO value array v[], using the permutation
   (cooPerm) and duplicate map (cooPerm_a) computed earlier by
   MatSetPreallocationCOO_SeqAIJCUSPARSE().

   v may live on the host (copied over) or already on the device.  With
   ADD_VALUES the (possibly reduced) values are added to the current ones; with
   INSERT_VALUES they replace them.  v == NULL with INSERT_VALUES zeroes the
   matrix.  Finishes with a trimmed-down equivalent of MatAssemblyEnd_SeqAIJ. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY                           *cooPerm_v = NULL;   /* device copy of v when v is a host array */
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                             *matrix;
  PetscErrorCode                        ierr;
  PetscInt                              n;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) { /* no COO preallocation was done; just (re)assemble */
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) {
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else {
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v,v+n);
    d_v = cooPerm_v->data();
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
  }
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to add them up before accumulating */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
        cooPerm_a = [0,0,1,2,3,4]. The length is n, the number of nonzeros in d_v[].
        cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
      */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* all nonzeros in d_v[] are unique entries */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]]  */
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else {
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals()); /* values[i] = d_v[cooPerm[i]] */
    }
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU; /* only the device copy was updated */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr);
  ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",a->rmax);CHKERRQ(ierr);
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
3957 
/* Mark the cached device transpose of A as out of date; when destroy is true the
   transpose structure and the csr2csc index cache are freed outright. */
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp) {
    if (destroy) {
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
      delete cusp->csr2csc_i;
      cusp->csr2csc_i = NULL;
    }
    A->transupdated = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3974 
3975 #include <thrust/binary_search.h>
/* Preallocate the matrix from a list of n COO entries (coo_i[], coo_j[]),
   possibly containing duplicates.

   On the GPU: sorts the entries into CSR order (recording the permutation in
   cusp->cooPerm), collapses duplicates, and builds in cusp->cooPerm_a — only when
   duplicates exist — the map from each sorted COO entry to its unique-nonzero
   index.  The CSR row pointers and column indices are then copied back to build
   the host aij structure.  Values are zeroed; MatSetValuesCOO_SeqAIJCUSPARSE()
   fills them later using cooPerm/cooPerm_a. */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0; /* nzr counts rows with at least one nonzero */
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) { /* COO size changed; discard the stale permutation arrays */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);           /* device copies of the COO indices */
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);   /* per-row end offsets, computed below */

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);

    /* Ex.
      n = 6
      coo_i = [3,3,1,4,1,4]
      coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    *cusp->cooPerm_a = d_i; /* copy the sorted array */
    THRUSTINTARRAY w = d_j;

    /*
      d_i     = [1,1,3,3,4,4]
      d_j     = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i     = [1,3,3,4,4,x]
                            ^ekey
      d_j     = [2,2,3,5,6,x]
                           ^nekye
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());                                              /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
      (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a =          [0,0,1,1,1,1]*/
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin()); /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* replace the host aij structure with one matching the deduplicated COO pattern */
    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);

  /* values are not set yet; MatSetValuesCOO() must be called before use */
  A->assembled = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
4093 
4094 /*@C
4095     MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices.
4096 
4097    Not collective
4098 
4099     Input Parameters:
4100 +   A - the matrix
4101 -   compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form
4102 
4103     Output Parameters:
+   i - the CSR row pointers
-   j - the CSR column indices
4106 
4107     Level: developer
4108 
4109     Notes:
4110       When compressed is true, the CSR structure does not contain empty rows
4111 
4112 .seealso: MatSeqAIJCUSPARSERestoreIJ(), MatSeqAIJCUSPARSEGetArrayRead()
4113 @*/
/* Implementation of MatSeqAIJCUSPARSEGetIJ(): hand out device pointers to the
   CSR row offsets and column indices of the cached cusparse matrix.

   When compressed is false but the matrix is stored in compressed-row form, a
   full (one entry per row) row-offset array is built from the host a->i on first
   use and cached in cusp->rowoffsets_gpu. */
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  if (!i || !j) PetscFunctionReturn(0); /* early out: both pointers must be requested together */
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); /* make sure the device CSR is current */
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (i) { /* always true here given the early return above */
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      if (!cusp->rowoffsets_gpu) {
        cusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
        ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(0);
}
4142 
/*@C
    MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ()

   Not collective

    Input Parameters:
+   A - the matrix
-   compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form

    Output Parameters:
+   i - the CSR row pointers
-   j - the CSR column indices

    Level: developer

.seealso: MatSeqAIJCUSPARSEGetIJ()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* just invalidate the caller-held pointers; the device storage itself is untouched */
  if (j) *j = NULL;
  if (i) *i = NULL;
  PetscFunctionReturn(0);
}
4169 
/*@C
   MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes: may trigger host-to-device data transfers if the up-to-date matrix data is currently on the host

.seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArrayRead()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *matrix;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* make sure the device copy is current before handing out a pointer to it */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusparsestruct->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix*)cusparsestruct->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = matrix->values->data().get();
  PetscFunctionReturn(0);
}
4205 
/*@C
   MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead()

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

.seealso: MatSeqAIJCUSPARSEGetArrayRead()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* read-only access: no state bump needed, just invalidate the caller's pointer */
  *a = NULL;
  PetscFunctionReturn(0);
}
4230 
/*@C
   MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes: may trigger host-to-device data transfers if the up-to-date matrix data is currently on the host

.seealso: MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArray()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *matrix;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* bring the device copy up to date, then hand out a mutable pointer */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusparsestruct->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix*)cusparsestruct->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = matrix->values->data().get();
  /* the caller may modify the values: device becomes the authoritative copy and any cached transpose values are stale */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/*@C
   MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray()

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

.seealso: MatSeqAIJCUSPARSEGetArray()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* values may have changed through the returned pointer */
  *a   = NULL;
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4295 
/*@C
   MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes: does not trigger host-device copies and flags data validity on the GPU

.seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSERestoreArrayWrite()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *matrix;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* write-only access: existing values will be overwritten, so no MatSeqAIJCUSPARSECopyToGPU() here */
  if (!cusparsestruct->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix*)cusparsestruct->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = matrix->values->data().get();
  /* device becomes the authoritative copy and any cached transpose values are stale */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4332 
/*@C
   MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite()

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

.seealso: MatSeqAIJCUSPARSEGetArrayWrite()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* values were (re)written through the returned pointer */
  *a   = NULL;
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4360 
/* Strict weak ordering on the leading (row,col) pair of a merge tuple; value and permutation entries are ignored */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    if (t1.get<0>() != t2.get<0>()) return t1.get<0>() < t2.get<0>();
    return t1.get<1>() < t2.get<1>();
  }
};
4371 
/* Unary functor adding a fixed offset to an integer (used to shift column/row indices) */
struct Shift
{
  int _shift; /* constant offset applied to every input */

  Shift(int shift) : _shift(shift) {}
  __host__ __device__
  inline int operator() (const int &v)
  {
    return _shift + v;
  }
};
4383 
/* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in matlab notation

   Input Parameters:
+  A     - left matrix (MATSEQAIJCUSPARSE), provides columns 0..A->cmap->n-1 of C
.  B     - right matrix (MATSEQAIJCUSPARSE), same row count as A, shifted into columns A->cmap->n..n-1
-  reuse - MAT_INITIAL_MATRIX creates C (and remembers the merge permutation in Ccusp->cooPerm);
           MAT_REUSE_MATRIX only refreshes the numerical values of an existing C. MAT_INPLACE_MATRIX is not supported.

   Output Parameter:
.  C     - the merged matrix, assembled with data valid on the GPU
*/
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     Annz,Bnnz;
  cusparseStatus_t             stat;
  PetscInt                     i,m,n,zero = 0;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidHeaderSpecific(B,MAT_CLASSID,2);
  PetscValidPointer(C,4);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
  if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
  if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
  if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    /* --- symbolic + numeric phase: build C and its device CSR from scratch --- */
    m     = A->rmap->n;
    n     = A->cmap->n + B->cmap->n;
    ierr  = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
    ierr  = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
    ierr  = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    c     = (Mat_SeqAIJ*)(*C)->data;
    Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
    Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr  = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows    = m;
    Ccusp->mat      = Cmat;
    Ccusp->mat->mat = Ccsr;
    Ccsr->num_rows  = m;
    Ccsr->num_cols  = n;
    stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
    if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
    if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr = (CsrMatrix*)Acusp->mat->mat;
    Bcsr = (CsrMatrix*)Bcusp->mat->mat;
    Annz = (PetscInt)Acsr->column_indices->size();
    Bnnz = (PetscInt)Bcsr->column_indices->size();
    c->nz = Annz + Bnnz;
    Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    Ccsr->num_entries = c->nz;
    /* cooPerm records, for each entry of A then B, its destination slot in C; reused in the MAT_REUSE_MATRIX path */
    Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      auto Acoo = new THRUSTINTARRAY32(Annz);
      auto Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff,*Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
          ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
          ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      /* expand CSR row offsets of A and B to COO row indices so entries can be merged row by row */
      stat = cusparseXcsr2coo(Acusp->handle,
                              Aroff->data().get(),
                              Annz,
                              m,
                              Acoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle,
                              Broff->data().get(),
                              Bnnz,
                              m,
                              Bcoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
      /* shift B's columns by A->cmap->n on the fly while merging */
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
#endif
      /* merge (row,col,val,origin-flag) streams of A and B in (row,col) order; wPerm marks A-origin entries with 1 */
      auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
      auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
      auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
      auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
      auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
      auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
      auto p1 = Ccusp->cooPerm->begin();
      auto p2 = Ccusp->cooPerm->begin();
      thrust::advance(p2,Annz);
      PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
      /* undo the in-place shift of B's column indices applied above */
      thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
#endif
      /* split the merged positions back into the A part (p1) and B part (p2) of cooPerm */
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      auto pred = thrust::identity<int>();
      PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
      PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
#endif
      /* compress the merged COO row indices back into C's CSR row offsets */
      stat = cusparseXcoo2csr(Ccusp->handle,
                              Ccoo->data().get(),
                              c->nz,
                              m,
                              Ccsr->row_offsets->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
                               Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
        ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr);
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix *CcsrT = new CsrMatrix;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated = PETSC_TRUE;
        Ccusp->rowoffsets_gpu = NULL;
        CmatT->cprowIndices = NULL;
        CmatT->mat = CcsrT;
        CcsrT->num_rows = n;
        CcsrT->num_cols = m;
        CcsrT->num_entries = c->nz;

        CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values = new THRUSTARRAY(c->nz);

        /* C' is the vertical stack [A'; B']: copy A' rows first, then B' rows with offsets shifted by nnz(A) */
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
          thrust::advance(rT,-1); /* the last offset of A' coincides with the first offset of B' */
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
          thrust::copy(titb,tite,rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

        stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
                                 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
                                 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* mirror the device CSR structure on the host so C behaves like a regular SeqAIJ matrix */
    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii   = *Ccsr->row_offsets;
      jj   = *Ccsr->column_indices;
      cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    } else {
      cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
    c->maxnz = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax = 0;
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i+1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax,nn);
    }
    ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
    (*C)->nonzerostate++;
    ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
    ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* --- numeric-only phase: scatter A and B values into the existing C via the stored cooPerm --- */
    if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT,(*C)->rmap->n,B->rmap->n);
    c = (Mat_SeqAIJ*)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
      if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
      if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
      if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
      if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix*)Acusp->mat->mat;
      Bcsr = (CsrMatrix*)Bcusp->mat->mat;
      Ccsr = (CsrMatrix*)Ccusp->mat->mat;
      if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %" PetscInt_FMT " != %" PetscInt_FMT,Acsr->num_entries,(PetscInt)Acsr->values->size());
      if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %" PetscInt_FMT " != %" PetscInt_FMT,Bcsr->num_entries,(PetscInt)Bcsr->values->size());
      if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT,Ccsr->num_entries,(PetscInt)Ccsr->values->size());
      if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT,Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
      if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %" PetscInt_FMT " != %" PetscInt_FMT,(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid,Acsr->num_entries);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      /* cooPerm[0:nnz(A)) holds destinations for A's values, cooPerm[nnz(A):) for B's */
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      thrust::for_each(zibait,zieait,VecCUDAEquals());
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
      thrust::for_each(zibbit,ziebit,VecCUDAEquals());
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr);
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    }
  }
  ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
4687 
/* Gathers n entries of A's device value array into v: v[k] = aij_values[idx[k]].
   v may live in host or device memory (detected via isCudaMem). With idx == NULL
   the first n values are copied contiguously. May trigger a host-to-device update
   of A through MatSeqAIJCUSPARSEGetArrayRead(). */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  PetscErrorCode    ierr;
  bool              dmem;
  const PetscScalar *av;
  cudaError_t       cerr;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
  if (n && idx) {
    /* upload the index set, then gather on the device */
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

    THRUSTARRAY *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      /* v is host memory: gather into a device scratch buffer first */
      w = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (w) {
      cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    delete w;
  } else {
    cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  /* when v is host memory the data moved device -> host, so log in that direction */
  if (!dmem) { ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr); }
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4727