xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 2c71b3e237ead271e4f3aa1505f92bf476e3413d)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
7 
8 #include <petscconf.h>
9 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
10 #include <../src/mat/impls/sbaij/seq/sbaij.h>
11 #include <../src/vec/vec/impls/dvecimpl.h>
12 #include <petsc/private/vecimpl.h>
13 #undef VecType
14 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
15 #include <thrust/adjacent_difference.h>
16 #include <thrust/async/for_each.h>
17 #include <thrust/iterator/constant_iterator.h>
18 #include <thrust/remove.h>
19 #include <thrust/sort.h>
20 #include <thrust/unique.h>
21 
/* Human-readable names for MatCUSPARSEStorageFormat; trailing entries are the enum type
   name and option prefix required by PetscOptionsEnum(). */
const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
    0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
  } cusparseCsr2CscAlg_t;
  */
  const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
  const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
  const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif
58 
59 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
60 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
61 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
62 
63 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
64 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
65 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
66 
67 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
68 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
69 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
70 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
71 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
72 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
73 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
74 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
75 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
76 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
77 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
78 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
79 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
80 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
81 
82 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
83 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
84 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
85 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
86 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
87 
88 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
89 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);
90 
91 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
92 
/* Attach a caller-supplied CUDA stream to the cuSPARSE handle of A; later cuSPARSE
   calls made through this matrix are queued on that stream. Errors with
   PETSC_ERR_COR if the matrix has no GPU data structure (spptr) yet. */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cstat;

  PetscFunctionBegin;
  PetscCheckFalse(!cusp,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusp->stream = stream;
  cstat = cusparseSetStream(cusp->handle,cusp->stream);CHKERRCUSPARSE(cstat);
  PetscFunctionReturn(0);
}
104 
/* Adopt a caller-supplied cuSPARSE handle for A, destroying any handle the matrix
   currently owns, and (re)set device pointer mode on the active handle. Errors with
   PETSC_ERR_COR if the matrix has no GPU data structure (spptr) yet. */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cstat;

  PetscFunctionBegin;
  PetscCheckFalse(!cusp,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusp->handle != handle) {
    /* release the handle we currently own before adopting the new one */
    if (cusp->handle) {cstat = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(cstat);}
    cusp->handle = handle;
  }
  cstat = cusparseSetPointerMode(cusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(cstat);
  PetscFunctionReturn(0);
}
121 
/* Drop (without destroying) the cuSPARSE handle stored in A's GPU data structure.
   A no-op for matrices that are not SEQAIJCUSPARSE or have no spptr. */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          isseqaijcusparse;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&isseqaijcusparse);CHKERRQ(ierr);
  if (isseqaijcusparse && cusp && cusp->handle) cusp->handle = 0;
  PetscFunctionReturn(0);
}
134 
/* Report the solver package backing this factored matrix: always MATSOLVERCUSPARSE. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
141 
142 /*MC
143   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
144   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
146   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
147   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
148   algorithms are not recommended. This class does NOT support direct solver operations.
149 
150   Level: beginner
151 
152 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
153 M*/
154 
/* Factory routine for MATSOLVERCUSPARSE: creates the (empty) factor matrix B for A
   and wires up the symbolic-factorization function pointers appropriate to the
   requested factor type. Numeric-factorization pointers are installed later by the
   symbolic routines. Only LU/ILU/ILUDT and Cholesky/ICC are supported. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  /* factortype must be set before MatSetType() so the type constructor sees it */
  (*B)->factortype = ftype;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  /* propagate CPU binding from A to the factor when requested */
  if (A->boundtocpu && A->bindingpropagates) { ierr = MatBindToCPU(*B,PETSC_TRUE);CHKERRQ(ierr); }
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    /* GPU path uses the CUSPARSE symbolic wrappers; CPU-bound matrices fall back to plain SeqAIJ */
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    /* preferred orderings: nested dissection for full LU, natural for incomplete factorizations */
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);CHKERRQ(ierr);
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]);CHKERRQ(ierr);
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* factor matrices get their storage during factorization, not here */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  (*B)->canuseordering = PETSC_TRUE;
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
196 
/* Type-specific implementation of MatCUSPARSESetFormat(): for a sequential matrix
   there is only one stored matrix, so MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL both
   set the same format field. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT: /* fall through: a sequential matrix has a single storage format */
  case MAT_CUSPARSE_ALL:
    cusp->format = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
214 
/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.

   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation (no-op for other matrix types) */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
241 
/* Type-specific implementation of MatCUSPARSESetUseCPUSolve(): records whether
   MatSolve() should run on the CPU instead of the GPU for this matrix. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A,PetscBool use_cpu)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  /* guard against a matrix whose GPU data structure was never created; the sibling
     setters MatCUSPARSESetStream()/MatCUSPARSESetHandle() perform the same check */
  PetscCheckFalse(!cusparsestruct,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusparsestruct->use_cpu_solve = use_cpu;
  PetscFunctionReturn(0);
}
250 
/*@
   MatCUSPARSESetUseCPUSolve - Sets use CPU MatSolve.

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
-  use_cpu - set flag for using the built-in CPU MatSolve

   Notes:
   The cuSparse LU solver currently computes the factors with the built-in CPU method
   and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
   This method specifies whether the solve is done on the CPU or GPU (GPU is the default).

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A,PetscBool use_cpu)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation (no-op for other matrix types) */
  ierr = PetscTryMethod(A,"MatCUSPARSESetUseCPUSolve_C",(Mat,PetscBool),(A,use_cpu));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
278 
/* Handle MatSetOption() for SEQAIJCUSPARSE: MAT_FORM_EXPLICIT_TRANSPOSE is
   intercepted here; all other options are forwarded to the SeqAIJ implementation. */
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);}
    A->form_explicit_transpose = flg;
  } else {
    ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
296 
297 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
298 
/* Numeric LU factorization: factor on the CPU via SeqAIJ, then select the MatSolve
   variants and (unless a CPU solve was requested) analyze/upload the triangular
   factors to the GPU. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ         *b = (Mat_SeqAIJ*)B->data;
  IS                 isrow = b->row,iscol = b->col;
  PetscBool          row_identity,col_identity;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)B->spptr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used: natural ordering allows
     the permutation-free kernels */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!cusparsestruct->use_cpu_solve) {
    if (row_identity && col_identity) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
  }
  /* no MatMatSolve support on the GPU */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  if (!cusparsestruct->use_cpu_solve) {
    ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
336 
/* Process -mat_cusparse_* runtime options for a SEQAIJCUSPARSE matrix: storage
   format, CPU-vs-GPU solve, and (CUDA >= 11) the cuSPARSE SpMV/SpMM/csr2csc
   algorithm choices. Only unfactored matrices (MAT_FACTOR_NONE) take options. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    /* storage format for SpMV only */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    /* storage format for all operations (overrides the SpMV-only option when both given) */
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
    ierr = PetscOptionsBool("-mat_cusparse_use_cpu_solve","Use CPU (I)LU solve","MatCUSPARSESetUseCPUSolve",cusparsestruct->use_cpu_solve,&cusparsestruct->use_cpu_solve,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetUseCPUSolve(A,cusparsestruct->use_cpu_solve);CHKERRQ(ierr);}
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
    PetscCheckFalse(flg && CUSPARSE_SPMV_CSR_ALG1 != 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#else
    PetscCheckFalse(flg && CUSPARSE_CSRMV_ALG1 != 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#endif
    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    PetscCheckFalse(flg && CUSPARSE_SPMM_CSR_ALG1 != 4,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    PetscCheckFalse(flg && CUSPARSE_CSR2CSC_ALG1 != 1,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
   #endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
377 
/* Symbolic ILU factorization: discard any stale GPU triangular factors, run the
   CPU SeqAIJ symbolic phase, and install the CUSPARSE numeric phase. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
389 
/* Symbolic LU factorization: discard any stale GPU triangular factors, run the
   CPU SeqAIJ symbolic phase, and install the CUSPARSE numeric phase. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
401 
/* Symbolic ICC factorization: discard any stale GPU triangular factors, run the
   CPU SeqAIJ symbolic phase, and install the CUSPARSE numeric phase. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
413 
/* Symbolic Cholesky factorization: discard any stale GPU triangular factors, run
   the CPU SeqAIJ symbolic phase, and install the CUSPARSE numeric phase. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
425 
/* Build (or refresh) the unit-lower-triangular factor L of an ILU factorization on
   the GPU. The strictly-lower part of the host factor (rows of a->a/a->j up to the
   diagonal) is repacked into a CSR matrix with an explicit 1.0 appended as the last
   entry of each row (the unit diagonal), staged in pinned host buffers, and copied
   into thrust device arrays; a cuSPARSE triangular-solve analysis is then run on it.
   On later calls with an existing loTriFactor only the numerical values are
   refreshed (the sparsity pattern is assumed unchanged). */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* only rebuild when the CPU copy is (or may be) newer than the GPU copy */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {
        PetscScalar                       *AALo;

        /* pinned host staging buffer for the values (kept in AA_h for later updates) */
        cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the lower triangular matrix */
        cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the lower triangular matrix: row 0 holds only the unit diagonal;
           each subsequent row i copies its strictly-lower entries and then appends
           column i with value 1.0 */
        AiLo[0]  = (PetscInt) 0;
        AiLo[n]  = nzLower;
        AjLo[0]  = (PetscInt) 0;
        AALo[0]  = (MatScalar) 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        rowOffset= 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix: assign() copies the staged host CSR arrays to the device */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* CUDA >= 9 csrsv2 API needs an explicitly sized work buffer for the analysis/solve */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(),
                               #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 loTriFactor->solveInfo,
                                 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                               #else
                                 loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
                               #endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer; AA_h is retained for value-only refreshes, the index
           staging buffers are no longer needed */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h = AALo;
        cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the lower triangular matrix: same layout as the build above, values only */
        loTriFactor->AA_h[0]  = 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
          offset      += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset      += 1;
          v  += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
572 
573 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
574 {
575   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
576   PetscInt                          n = A->rmap->n;
577   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
578   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
579   cusparseStatus_t                  stat;
580   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
581   const MatScalar                   *aa = a->a,*v;
582   PetscInt                          *AiUp, *AjUp;
583   PetscInt                          i,nz, nzUpper, offset;
584   PetscErrorCode                    ierr;
585   cudaError_t                       cerr;
586 
587   PetscFunctionBegin;
588   if (!n) PetscFunctionReturn(0);
589   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
590     try {
591       /* next, figure out the number of nonzeros in the upper triangular matrix. */
592       nzUpper = adiag[0]-adiag[n];
593       if (!upTriFactor) {
594         PetscScalar *AAUp;
595 
596         cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
597 
598         /* Allocate Space for the upper triangular matrix */
599         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
600         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
601 
602         /* Fill the upper triangular matrix */
603         AiUp[0]=(PetscInt) 0;
604         AiUp[n]=nzUpper;
605         offset = nzUpper;
606         for (i=n-1; i>=0; i--) {
607           v  = aa + adiag[i+1] + 1;
608           vi = aj + adiag[i+1] + 1;
609 
610           /* number of elements NOT on the diagonal */
611           nz = adiag[i] - adiag[i+1]-1;
612 
613           /* decrement the offset */
614           offset -= (nz+1);
615 
616           /* first, set the diagonal elements */
617           AjUp[offset] = (PetscInt) i;
618           AAUp[offset] = (MatScalar)1./v[nz];
619           AiUp[i]      = AiUp[i+1] - (nz+1);
620 
621           ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
622           ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
623         }
624 
625         /* allocate space for the triangular factor information */
626         ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
627         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
628 
629         /* Create the matrix description */
630         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
631         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
632        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
633         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
634        #else
635         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
636        #endif
637         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
638         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
639 
640         /* set the operation */
641         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
642 
643         /* set the matrix */
644         upTriFactor->csrMat = new CsrMatrix;
645         upTriFactor->csrMat->num_rows = n;
646         upTriFactor->csrMat->num_cols = n;
647         upTriFactor->csrMat->num_entries = nzUpper;
648 
649         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
650         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
651 
652         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
653         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
654 
655         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
656         upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
657 
658         /* Create the solve analysis information */
659         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
660         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
661       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
662         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
663                                      upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
664                                      upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
665                                      upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
666                                      &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
667         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
668       #endif
669 
670         /* perform the solve analysis */
671         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
672                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
673                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
674                                  upTriFactor->csrMat->column_indices->data().get(),
675                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
676                                  upTriFactor->solveInfo,
677                                  upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
678                                #else
679                                  upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
680                                #endif
681         cerr = WaitForCUDA();CHKERRCUDA(cerr);
682         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
683 
684         /* assign the pointer */
685         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
686         upTriFactor->AA_h = AAUp;
687         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
688         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
689         ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
690       } else {
691         if (!upTriFactor->AA_h) {
692           cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
693         }
694         /* Fill the upper triangular matrix */
695         offset = nzUpper;
696         for (i=n-1; i>=0; i--) {
697           v  = aa + adiag[i+1] + 1;
698 
699           /* number of elements NOT on the diagonal */
700           nz = adiag[i] - adiag[i+1]-1;
701 
702           /* decrement the offset */
703           offset -= (nz+1);
704 
705           /* first, set the diagonal elements */
706           upTriFactor->AA_h[offset] = 1./v[nz];
707           ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
708         }
709         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
710         ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
711       }
712     } catch(char *ex) {
713       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
714     }
715   }
716   PetscFunctionReturn(0);
717 }
718 
/*
   Builds both ILU triangular factors on the GPU and, when the row/column
   orderings are not the identity, caches them on the device as thrust index
   arrays (rpermIndices/cpermIndices) for use by the solve routines.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           rowIS    = aij->row,colIS = aij->icol;
  PetscBool                    rowIsIdentity,colIsIdentity;
  PetscInt                     nrows    = A->rmap->n;

  PetscFunctionBegin;
  PetscCheckFalse(!factors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  if (!factors->workVector) factors->workVector = new THRUSTARRAY(nrows);
  factors->nnz = aij->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* copy the row permutation to the GPU only when it is non-trivial and not already cached */
  ierr = ISIdentity(rowIS,&rowIsIdentity);CHKERRQ(ierr);
  if (!rowIsIdentity && !factors->rpermIndices) {
    const PetscInt *r;

    ierr = ISGetIndices(rowIS,&r);CHKERRQ(ierr);
    factors->rpermIndices = new THRUSTINTARRAY(nrows);
    factors->rpermIndices->assign(r,r+nrows);
    ierr = ISRestoreIndices(rowIS,&r);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(nrows*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* likewise for the column permutation */
  ierr = ISIdentity(colIS,&colIsIdentity);CHKERRQ(ierr);
  if (!colIsIdentity && !factors->cpermIndices) {
    const PetscInt *c;

    ierr = ISGetIndices(colIS,&c);CHKERRQ(ierr);
    factors->cpermIndices = new THRUSTINTARRAY(nrows);
    factors->cpermIndices->assign(c,c+nrows);
    ierr = ISRestoreIndices(colIS,&c);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(nrows*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
762 
/*
   Builds (or refreshes) the cuSPARSE triangular-solve data for an ICC
   factorization of A.  The factor is kept in upper-triangular CSR form: the
   upper factor is stored directly, while the lower solve reuses the same
   sparsity pattern (AiUp/AjUp) with its own values (AALo) and is carried out
   as a transpose solve (CUSPARSE_OPERATION_TRANSPOSE set on loTriFactor
   below).  On the first call the index arrays, descriptors and cuSPARSE
   solve analyses are created; on later calls (pattern already on the GPU)
   only the numerical values are recomputed and re-uploaded.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is viewed both as Mat_SeqAIJ (nz above) and as
     Mat_SeqSBAIJ (i/j/a below) -- presumably the ICC factor is stored in
     SBAIJ (upper-triangle-only) layout; confirm against the symbolic phase. */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers for the upper (AAUp) and lower (AALo) factor values */
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers; the diagonal entry of row i sits at v[nz] (end of the row) */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            /* off-diagonals: the upper factor takes -a_ij, the (transposed)
               lower factor takes -a_ij further divided by the diagonal v[nz] */
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        /* NOTE(review): the diagonal is marked UNIT even though reciprocal
           diagonal values are stored in AAUp -- cuSPARSE should then ignore the
           stored diagonal during this solve; confirm the solve phase applies
           the diagonal scaling elsewhere. */
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(),
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 upTriFactor->solveInfo,
                                 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                                #else
                                  upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
                                #endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        /* the "lower" factor reuses the upper-triangular storage: fill mode stays
           UPPER and the solve is performed with CUSPARSE_OPERATION_TRANSPOSE below */
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix: same row_offsets/column_indices as the upper factor, own values (AALo) */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(),
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 loTriFactor->solveInfo,
                                 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                                #else
                                 loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
                                #endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      } else {
        /* the sparsity pattern is already on the GPU: recompute only the values */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        PetscCheckFalse(!upTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        PetscCheckFalse(!loTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      /* device-side thrust arrays now hold the data; release the pinned staging buffers */
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
982 
/*
   Uploads the ICC factor to the GPU (via MatSeqAIJCUSPARSEBuildICCTriMatrices)
   and, for a non-identity ordering, caches the permutation and its inverse on
   the device for use by the solve routines.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           perm     = aij->row;
  PetscBool                    permIsIdentity;
  PetscInt                     nrows    = A->rmap->n;

  PetscFunctionBegin;
  PetscCheckFalse(!factors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!factors->workVector) factors->workVector = new THRUSTARRAY(nrows);
  /* both triangular factors are stored, so strictly-triangular nonzeros count twice */
  factors->nnz = (aij->nz-nrows)*2 + nrows;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  ierr = ISIdentity(perm,&permIsIdentity);CHKERRQ(ierr);
  if (!permIsIdentity) {
    IS             invPerm;
    const PetscInt *inv,*fwd;

    /* NOTE(review): unlike the ILU path, no check for an already-allocated
       rpermIndices/cpermIndices here -- presumably this runs once per factor;
       verify to rule out a leak on repeated calls */
    ierr = ISInvertPermutation(perm,PETSC_DECIDE,&invPerm);CHKERRQ(ierr);
    ierr = ISGetIndices(invPerm,&inv);CHKERRQ(ierr);
    ierr = ISGetIndices(perm,&fwd);CHKERRQ(ierr);
    factors->rpermIndices = new THRUSTINTARRAY(nrows);
    factors->rpermIndices->assign(fwd,fwd+nrows);
    factors->cpermIndices = new THRUSTINTARRAY(nrows);
    factors->cpermIndices->assign(inv,inv+nrows);
    ierr = ISRestoreIndices(invPerm,&inv);CHKERRQ(ierr);
    ierr = ISDestroy(&invPerm);CHKERRQ(ierr);
    ierr = ISRestoreIndices(perm,&fwd);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*nrows*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1020 
/*
   Numeric Cholesky factorization for SEQAIJCUSPARSE: the factorization itself
   runs on the CPU (MatCholeskyFactorNumeric_SeqAIJ); afterwards the solve
   function pointers are selected and the triangular factors are pushed to the
   GPU for cuSPARSE-based solves.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *bfac = (Mat_SeqAIJ*)B->data;
  IS             perm  = bfac->row;
  PetscBool      naturalOrdering;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* pick the MatSolve variant depending on whether the ordering is natural */
  ierr = ISIdentity(perm,&naturalOrdering);CHKERRQ(ierr);
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;
  if (naturalOrdering) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* build/upload the triangular factors and run the cuSPARSE solve analysis */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1050 
1051 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
1052 {
1053   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1054   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1055   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1056   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
1057   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1058   cusparseStatus_t                  stat;
1059   cusparseIndexBase_t               indexBase;
1060   cusparseMatrixType_t              matrixType;
1061   cusparseFillMode_t                fillMode;
1062   cusparseDiagType_t                diagType;
1063   cudaError_t                       cerr;
1064   PetscErrorCode                    ierr;
1065 
1066   PetscFunctionBegin;
1067   /* allocate space for the transpose of the lower triangular factor */
1068   ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
1069   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1070 
1071   /* set the matrix descriptors of the lower triangular factor */
1072   matrixType = cusparseGetMatType(loTriFactor->descr);
1073   indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
1074   fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1075     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1076   diagType = cusparseGetMatDiagType(loTriFactor->descr);
1077 
1078   /* Create the matrix description */
1079   stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
1080   stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
1081   stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
1082   stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
1083   stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
1084 
1085   /* set the operation */
1086   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1087 
1088   /* allocate GPU space for the CSC of the lower triangular factor*/
1089   loTriFactorT->csrMat = new CsrMatrix;
1090   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
1091   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
1092   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
1093   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
1094   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1095   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
1096 
1097   /* compute the transpose of the lower triangular factor, i.e. the CSC */
1098 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1099   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1100                                        loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1101                                        loTriFactor->csrMat->values->data().get(),
1102                                        loTriFactor->csrMat->row_offsets->data().get(),
1103                                        loTriFactor->csrMat->column_indices->data().get(),
1104                                        loTriFactorT->csrMat->values->data().get(),
1105                                        loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1106                                        CUSPARSE_ACTION_NUMERIC,indexBase,
1107                                        CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1108   cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1109 #endif
1110 
1111   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1112   stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1113                           loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1114                           loTriFactor->csrMat->values->data().get(),
1115                           loTriFactor->csrMat->row_offsets->data().get(),
1116                           loTriFactor->csrMat->column_indices->data().get(),
1117                           loTriFactorT->csrMat->values->data().get(),
1118                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1119                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1120                           CUSPARSE_ACTION_NUMERIC, indexBase,
1121                           CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat);
1122                         #else
1123                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1124                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1125                         #endif
1126   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1127   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1128 
1129   /* Create the solve analysis information */
1130   ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1131   stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1132 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1133   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
1134                                 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1135                                 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1136                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
1137                                 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1138   cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1139 #endif
1140 
1141   /* perform the solve analysis */
1142   stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
1143                            loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1144                            loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1145                            loTriFactorT->csrMat->column_indices->data().get(),
1146                           #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1147                            loTriFactorT->solveInfo,
1148                            loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
1149                           #else
1150                            loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1151                           #endif
1152   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1153   ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1154 
1155   /* assign the pointer */
1156   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1157 
1158   /*********************************************/
1159   /* Now the Transpose of the Upper Tri Factor */
1160   /*********************************************/
1161 
1162   /* allocate space for the transpose of the upper triangular factor */
1163   ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
1164   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1165 
1166   /* set the matrix descriptors of the upper triangular factor */
1167   matrixType = cusparseGetMatType(upTriFactor->descr);
1168   indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
1169   fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1170     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1171   diagType = cusparseGetMatDiagType(upTriFactor->descr);
1172 
1173   /* Create the matrix description */
1174   stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
1175   stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
1176   stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
1177   stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
1178   stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
1179 
1180   /* set the operation */
1181   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1182 
1183   /* allocate GPU space for the CSC of the upper triangular factor*/
1184   upTriFactorT->csrMat = new CsrMatrix;
1185   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1186   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1187   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1188   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
1189   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1190   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1191 
1192   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1193 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1194   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
1195                                 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1196                                 upTriFactor->csrMat->values->data().get(),
1197                                 upTriFactor->csrMat->row_offsets->data().get(),
1198                                 upTriFactor->csrMat->column_indices->data().get(),
1199                                 upTriFactorT->csrMat->values->data().get(),
1200                                 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1201                                 CUSPARSE_ACTION_NUMERIC,indexBase,
1202                                 CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1203   cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1204 #endif
1205 
1206   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1207   stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
1208                           upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1209                           upTriFactor->csrMat->values->data().get(),
1210                           upTriFactor->csrMat->row_offsets->data().get(),
1211                           upTriFactor->csrMat->column_indices->data().get(),
1212                           upTriFactorT->csrMat->values->data().get(),
1213                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1214                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1215                           CUSPARSE_ACTION_NUMERIC, indexBase,
1216                           CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat);
1217                         #else
1218                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1219                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1220                         #endif
1221 
1222   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1223   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1224 
1225   /* Create the solve analysis information */
1226   ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1227   stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1228   #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1229   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
1230                                  upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1231                                  upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1232                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
1233                                  &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1234   cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1235   #endif
1236 
1237   /* perform the solve analysis */
1238   stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
1239                            upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1240                            upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1241                            upTriFactorT->csrMat->column_indices->data().get(),
1242                           #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1243                            upTriFactorT->solveInfo,
1244                            upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
1245                           #else
1246                            upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1247                           #endif
1248 
1249   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1250   ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1251 
1252   /* assign the pointer */
1253   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1254   PetscFunctionReturn(0);
1255 }
1256 
/* Unary functor converting a PetscScalar to a PetscInt by truncating its real
   part; used with thrust::transform to read integer permutation indices back
   out of a scalar-valued array. */
struct PetscScalarToPetscInt
{
  __host__ __device__ PetscInt operator()(PetscScalar v) const
  {
    return static_cast<PetscInt>(PetscRealPart(v));
  }
};
1265 
/*
   Form the explicit transpose of A on the GPU and cache it at
   cusparsestruct->matTranspose.  If A->transupdated is already set the cached
   copy is still valid and the routine returns immediately.  For the
   MAT_CUSPARSE_CSR format the transpose is produced with csr2csc; the first
   call also caches in cusparsestruct->csr2csc_i the permutation that maps A's
   value array into the transposed layout, so later refreshes reduce to a
   single device-side gather.
*/
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* make sure A's own CSR data is current on the device before transposing it */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  PetscCheckFalse(!matstruct,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  PetscCheckFalse(A->transupdated && !matstructT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(0); /* cached transpose still valid */
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
    /* ELL/HYB transposes are not updated incrementally: discard and rebuild below */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
  }
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

    /* set alpha and beta: device-resident scalar constants used by later SpMV calls */
    err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* allocate CSR storage for A^T: row/column dimensions swapped, same nnz */
      CsrMatrix *matrixT = new CsrMatrix;
      matstructT->mat = matrixT;
      matrixT->num_rows = A->cmap->n;
      matrixT->num_cols = A->rmap->n;
      matrixT->num_entries = a->nz;
      matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values = new THRUSTARRAY(a->nz);

      /* upload A's host row offsets (a->i) so the csr2csc below can read them */
      if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      #if PETSC_PKG_CUDA_VERSION_GE(11,2,1)
        stat = cusparseCreateCsr(&matstructT->matDescr,
                               matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                               matrixT->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
      #else
        /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
           see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

           I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
           it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
           when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
        */
        if (matrixT->num_entries) {
          stat = cusparseCreateCsr(&matstructT->matDescr,
                                 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                                 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                                 matrixT->values->data().get(),
                                 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I,
                                 indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);

        } else {
          matstructT->matDescr = NULL;
          matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
        }
      #endif
     #endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
   #else
      /* pre-CUDA-11 path: HYB has no direct transpose, so go HYB -> CSR -> CSC -> HYB */
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows = A->rmap->n;
      temp->num_cols = A->cmap->n;
      temp->num_entries = a->nz;
      temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle,
                              matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get());CHKERRCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      tempT->num_rows = A->rmap->n;
      tempT->num_cols = A->cmap->n;
      tempT->num_entries = a->nz;
      tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                              temp->num_cols, temp->num_entries,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get(),
                              tempT->values->data().get(),
                              tempT->column_indices->data().get(),
                              tempT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

      /* Last, convert CSC to HYB */
      cusparseHybMat_t hybMat;
      stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
      cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
        CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                              matstructT->descr, tempT->values->data().get(),
                              tempT->row_offsets->data().get(),
                              tempT->column_indices->data().get(),
                              hybMat, 0, partition);CHKERRCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY*) tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
        delete (CsrMatrix*) tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY*) temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
        delete (CsrMatrix*) temp;
      }
     #endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
    PetscCheckFalse(!matrix,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix");
    PetscCheckFalse(!matrix->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows");
    PetscCheckFalse(!matrix->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols");
    PetscCheckFalse(!matrix->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values");
    PetscCheckFalse(!matrixT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT");
    PetscCheckFalse(!matrixT->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows");
    PetscCheckFalse(!matrixT->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols");
    PetscCheckFalse(!matrixT->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
      ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    if (!cusparsestruct->csr2csc_i) {
      /* First time through: compute the value permutation by pushing the
         sequence 0,1,2,... (stored as scalars) through csr2csc; the number each
         slot of matrixT->values receives tells us which entry of A's value
         array belongs there. */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* CUDA >= 11 requires an explicit, user-allocated work buffer for csr2csc */
      void   *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                           A->cmap->n, matrix->num_entries,
                                           matrix->values->data().get(),
                                           cusparsestruct->rowoffsets_gpu->data().get(),
                                           matrix->column_indices->data().get(),
                                           matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                           CUSPARSE_ACTION_NUMERIC,indexBase,
                                           cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat);
      err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err);
     #endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                              A->cmap->n,matrix->num_entries,
                              csr2csc_a.data().get(),
                              cusparsestruct->rowoffsets_gpu->data().get(),
                              matrix->column_indices->data().get(),
                              matrixT->values->data().get(),
                             #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                              matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                              CUSPARSE_ACTION_NUMERIC,indexBase,
                              cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat);
                             #else
                              matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
                             #endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
      }

      /* read the permutation back out of matrixT->values as integers and cache it */
      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      err = cudaFree(csr2cscBuffer);CHKERRCUDA(err);
     #endif
    }
    /* gather A's current values into the transposed layout using the cached permutation */
    PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
                                                     thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
                                                     matrixT->values->begin()));
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
1495 
1496 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
   Solve A^T x = b with the cached transposes of the triangular factors,
   building them on demand via MatSeqAIJCUSPARSEAnalyzeTransposeForSolve.
   Since the factorization was stored as L*U (with row/column permutations),
   the transposed system is solved by applying the transposed upper factor
   first and the transposed lower factor second, with b reordered by the row
   permutation on input and x by the column permutation on output.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder b with the row permutation, writing the result into x */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve with the transposed upper factor: input xarray, result in tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Then, solve with the transposed lower factor: input tempGPU, result in xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                         xarray);CHKERRCUSPARSE(stat);
                      #endif

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves cost about 2*nnz flops (minus the diagonal divides counted once) */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1584 
/*
   Solve A^T x = b when the factorization uses the natural ordering, so no
   row/column permutations are needed: solve the transposed upper factor
   directly on b, then the transposed lower factor into x.  The transposed
   factors are built on demand by MatSeqAIJCUSPARSEAnalyzeTransposeForSolve.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve with the transposed upper factor: input barray, result in tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Then, solve with the transposed lower factor: input tempGPU, result in xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        xarray);CHKERRCUSPARSE(stat);
                      #endif

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves cost about 2*nnz flops (minus the diagonal divides counted once) */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1654 
/*
   Solve A x = b with the cached L and U triangular factors: reorder b with the
   row permutation, solve L then U, and reorder the result with the column
   permutation.  All work happens on the GPU; tempGPU is the factor-sized
   scratch vector shared by the solve routines.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder b with the row permutation, writing the result into tempGPU */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L: input tempGPU, result in xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                         xarray,
                         loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                         xarray);CHKERRCUSPARSE(stat);
                      #endif

  /* Then, solve U: input xarray, result in tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,xarray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves cost about 2*nnz flops (minus the diagonal divides counted once) */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1730 
/*
  Solve A*x = b with the cached cuSPARSE triangular factors when the LU
  factorization used the natural ordering: no row/column permutation of the
  right-hand side or the solution is needed, so the lower-factor solve reads
  b directly and the upper-factor solve writes x directly, with the factors'
  work vector holding the intermediate result in between.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;  /* device array of the right-hand side bb */
  PetscScalar                       *xarray;  /* device array of the solution xx */
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; /* scratch holding the lower-solve result */
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve with the lower factor: barray -> tempGPU
     (CUDA >= 9 additionally passes nnz, a solve policy and a work buffer) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        loTriFactor->solvePolicy,loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Next, solve with the upper factor: tempGPU -> xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        xarray);CHKERRCUSPARSE(stat);
                      #endif

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* flop count for the two triangular solves: 2*nnz - n */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1792 
/*
  Mirror the matrix values from the device back into the host Mat_SeqAIJ array
  when the device copy is the only up-to-date one; afterwards both copies are
  marked current.  No-op for any other offload state.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *aij      = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusparse = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  size_t             nbytes;
  cudaError_t        cerr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (A->offloadmask != PETSC_OFFLOAD_GPU) PetscFunctionReturn(0); /* host copy already valid (or nothing allocated) */
  csr    = (CsrMatrix*)cusparse->mat->mat;
  nbytes = aij->nz*sizeof(PetscScalar);
  ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
  cerr = cudaMemcpy(aij->a, csr->values->data().get(), nbytes, cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu(nbytes);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_BOTH;
  PetscFunctionReturn(0);
}
1813 
/* MatSeqAIJGetArray() implementation: bring the host values up to date, then
   hand out the host value array. */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ     *aij;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr   = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  aij    = (Mat_SeqAIJ*)A->data;
  *array = aij->a;
  PetscFunctionReturn(0);
}
1823 
/* MatSeqAIJRestoreArray() implementation: the caller may have modified the host
   values, so mark the host copy as the only valid one. */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  *array         = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
1831 
/* MatSeqAIJGetArrayRead() implementation: bring the host values up to date and
   expose them for read-only access. */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  Mat_SeqAIJ     *aij;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr   = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  aij    = (Mat_SeqAIJ*)A->data;
  *array = aij->a;
  PetscFunctionReturn(0);
}
1841 
/* MatSeqAIJRestoreArrayRead() implementation: read access cannot have modified
   the host values, so the offload mask is deliberately left unchanged. */
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(0);
}
1848 
/* MatSeqAIJGetArrayWrite() implementation: hands out the host value array
   without first syncing values from the GPU (write-only access, so the current
   values need not be fetched). */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = ((Mat_SeqAIJ*)A->data)->a;
  PetscFunctionReturn(0);
}
1855 
/* MatSeqAIJRestoreArrayWrite() implementation: the host values were
   (re)written, so only the CPU copy is now valid. */
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  *array         = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
1863 
/*
  Copy the host (Mat_SeqAIJ) representation of A to the GPU.  If the nonzero
  pattern is unchanged and the format is CSR, only the values are re-uploaded;
  otherwise all cuSPARSE structures are destroyed and rebuilt (CSR, or ELL/HYB
  on CUDA < 11).  No-op unless the up-to-date copy currently lives on the CPU
  or nothing has been allocated on the device yet.
*/
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE; /* set FALSE when only the structure (no values) is uploaded */
  cudaError_t                  err;

  PetscFunctionBegin;
  PetscCheckFalse(A->boundtocpu,PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      PetscCheckFalse(a->nz && !a->a,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      matrix->values->assign(a->a, a->a+a->nz);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* new values invalidate the cached transpose values (PETSC_FALSE: pattern still valid) */
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
    } else {
      PetscInt nnz;
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* nonzero pattern (or storage format) changed: discard and rebuild all GPU data */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        /* with compressed-row storage only the nonempty rows are uploaded */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheckFalse(!ii,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
        PetscCheckFalse(m && !a->j,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");
        /* no host values yet: upload the structure only and keep the offload mask on GPU */
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
        else nnz = a->nz;

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident 1/0 constants, since the handle is switched to device pointer mode below */
        err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                    mat->num_rows, mat->num_cols, mat->num_entries,
                                    mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                    mat->values->data().get(),
                                    CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                    CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
          }
         #endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
         #else
          /* stage a temporary CSR copy on the device, convert it to HYB/ELL, then free the CSR */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
              matstruct->descr, mat->values->data().get(),
              mat->row_offsets->data().get(),
              mat->column_indices->data().get(),
              hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
         #endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        /* row offsets + column indices (int), cprow indices (PetscInt), 3 constants + values (PetscScalar) */
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
2020 
/* Thrust functor for zipped iterators: accumulates the first tuple element
   into the second, i.e. element<1> = element<1> + element<0>. */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple zipped)
  {
    thrust::get<1>(zipped) = thrust::get<0>(zipped) + thrust::get<1>(zipped);
  }
};
2030 
/* Thrust functor for zipped iterators: copies the first tuple element into
   the second. */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple zipped)
  {
    thrust::get<1>(zipped) = thrust::get<0>(zipped);
  }
};
2040 
/* Thrust functor for zipped iterators: copies the second tuple element into
   the first (the reverse direction of VecCUDAEquals). */
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple zipped)
  {
    thrust::get<0>(zipped) = thrust::get<1>(zipped);
  }
};
2050 
/* Product data cached on C->product->data for cuSPARSE-based matrix products
   (sparse*dense via SpMM/csrmm and sparse*sparse paths) */
struct MatMatCusparse {
  PetscBool             cisdense; /* caller's C was MATSEQDENSE (CPU); convert the GPU result back after the numeric phase */
  PetscScalar           *Bt;      /* device buffer holding an explicit B^T on CUDA < 11, where csrmm cannot transpose B */
  Mat                   X;        /* dense intermediate A*B (PtAP) or A*B^t (RARt) */
  PetscBool             reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops;
  CsrMatrix             *Bcsr;

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr;   /* sparse descriptor for B */
  PetscBool             initialized;   /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr;     /* dense descriptor wrapping B's device array */
  cusparseDnMatDescr_t  matCDescr;     /* dense descriptor wrapping C's (or X's) device array */
  PetscInt              Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  void                  *dBuffer4;     /* extra device work buffers freed in MatDestroy_MatMatCusparse */
  void                  *dBuffer5;
 #endif
  size_t                mmBufferSize;  /* current size of mmBuffer (grown on demand) */
  void                  *mmBuffer;     /* SpMM/SpGEMM device work buffer */
  void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};
2075 
/*
  Destroy callback for the MatMatCusparse data attached to C->product: releases
  every cuSPARSE descriptor and device buffer allocated during the symbolic and
  numeric phases, plus the intermediate dense matrix X.
*/
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  PetscErrorCode   ierr;
  MatMatCusparse   *mmdata = (MatMatCusparse *)data;
  cudaError_t      cerr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t stat;
 #endif

  PetscFunctionBegin;
  cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr); /* cudaFree(NULL) is a no-op, so no guard needed */
  delete mmdata->Bcsr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->matBDescr)   { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->matCDescr)   { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->spgemmDesc)  { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); }
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  if (mmdata->dBuffer4)  { cerr = cudaFree(mmdata->dBuffer4);CHKERRCUDA(cerr); }
  if (mmdata->dBuffer5)  { cerr = cudaFree(mmdata->dBuffer5);CHKERRCUDA(cerr); }
 #endif
  if (mmdata->mmBuffer)  { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); }
  if (mmdata->mmBuffer2) { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); }
 #endif
  ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2104 
2105 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
2106 
/*
  Numeric phase for products of a SEQAIJCUSPARSE matrix A with a dense matrix B
  (AB, AtB, ABt, PtAP, RARt).  The sparse-dense multiply is done with
  cusparseSpMM (CUDA >= 11) or cusparseXcsrmm (older CUDA, with an explicit
  cuBLAS transpose of B when op(B) = B^T).  For PtAP/RARt the result of the
  sparse multiply goes into the intermediate X and a final dense-dense multiply
  produces C.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheckFalse(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  PetscCheckFalse(!flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  PetscCheckFalse(A->boundtocpu,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr   = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp   = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* pick the CSR data (A or its explicit transpose), the cusparse op for op(A),
     and the dimensions m x n of the sparse-multiply result */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      /* use the cached explicit transpose with a non-transpose op instead */
      ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  PetscCheckFalse(!mat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);

  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* PtAP/RARt write the sparse-multiply result into the intermediate X, not C */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Blda = blda;
    }
    /* NOTE(review): a descriptor that survives this branch (e.g. matBDescr when only Clda
       changed) keeps the device pointer captured on an earlier call; this relies on the
       dense arrays being stable across calls -- confirm */

    if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
    /* grow the work buffer only when the required size exceeds the current one */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      cudaError_t cerr;
      cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
      cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
 #else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B, so form B^T in mmdata->Bt with cuBLAS geam */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k    = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
 #endif
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  /* PtAP/RARt: combine B with the intermediate X via a dense-dense multiply to get C */
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  /* the user originally passed CPU dense matrices: convert the results back */
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2279 
/*
  Symbolic phase for products of a SEQAIJCUSPARSE matrix (CSR format only) with
  a dense matrix: sets the sizes and type of C (always MATSEQDENSECUDA; if the
  user's C was a CPU MATSEQDENSE, the numeric phase converts the result back),
  allocates the MatMatCusparse product data (including the B^T buffer on
  CUDA < 11 and the intermediate X for PtAP/RARt), and installs the
  numeric-phase callback.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheckFalse(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  PetscCheckFalse(!flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscCheckFalse(cusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  /* result dimensions m x n for each supported product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense;
 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
 #endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2353 
/*
  Numeric phase of the sparse-sparse product C = op(A)*op(B) for MATSEQAIJCUSPARSE matrices.

  The symbolic phase (MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE) has already allocated
  C's CSR structure on the GPU and stashed the cuSPARSE spgemm descriptors/buffers in the
  MatMatCusparse product data; this routine only (re)computes the numerical values and then
  performs a reduced version of MatAssemblyEnd_SeqAIJ bookkeeping on the host.

  Collective on C? NOTE(review): all error comms use PetscObjectComm((PetscObject)C); for a
  Seq matrix this is expected to be PETSC_COMM_SELF -- confirm.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t         BmatSpDescr;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheckFalse(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  PetscCheckFalse(!flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;
  A = product->A;
  B = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    /* one-shot flag: only the first numeric call after the symbolic phase may skip the compute */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    PetscCheckFalse(Ccusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    PetscCheckFalse(!Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix*)Cmat->mat;
    PetscCheckFalse(!Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
    goto finalize;
  }
  /* empty product: nothing to compute numerically, just redo the assembly bookkeeping */
  if (!c->nz) goto finalize;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  PetscCheckFalse(!flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  PetscCheckFalse(!flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  PetscCheckFalse(A->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCheckFalse(B->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  PetscCheckFalse(Acusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  PetscCheckFalse(Bcusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  PetscCheckFalse(Ccusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  /* make sure the latest host values of A and B are on the GPU before computing */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);

  /* if the symbolic phase exploited symmetry of A (or B) it rewrote AtB (or ABt) as AB;
     replay the same rewrite here so we pick the same mult structs */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    PetscCheckFalse(!product->symbolic_used_the_fact_A_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    PetscCheckFalse(!product->symbolic_used_the_fact_B_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric");
  }
  /* since cuSPARSE spgemm cannot transpose on the fly (see opA/opB above), transposed
     operands use the explicit-transpose mult structs built during the symbolic phase */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  PetscCheckFalse(!Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  PetscCheckFalse(!Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  PetscCheckFalse(!Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix*)Cmat->mat;
  PetscCheckFalse(!Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  PetscCheckFalse(!Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  PetscCheckFalse(!Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  /* alpha_one/beta_zero live in device memory (see symbolic phase), hence DEVICE pointer mode */
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
    /* SpGEMMreuse path: structure was computed once in the symbolic phase, only values are redone */
    stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                               Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                               cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                               mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
  #else
    /* SpGEMM path: re-run compute with the work buffer saved by the symbolic phase, then copy into C */
    stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                               Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                               cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                               mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
    stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                               Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                               cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
  #endif
#else
  /* legacy (pre-CUDA 11) csrgemm interface recomputes the whole product in one call */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  /* flops were precomputed in the symbolic phase and cached in mmdata */
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
  ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax);CHKERRQ(ierr);
  c->reallocs         = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}
2483 
/*
  Symbolic phase of the sparse-sparse product C = op(A)*op(B) for MATSEQAIJCUSPARSE matrices.

  Because cuSPARSE cannot perform a purely symbolic spgemm (see the comment in the legacy
  branch below), this routine also computes the numerical values of C on the GPU; the
  numeric phase can then either reuse them (mmdata->reusesym) or recompute cheaply through
  the descriptors/buffers cached here in the MatMatCusparse product data.

  Three backends, selected at compile time:
    - CUDA >= 11.4: cusparseSpGEMMreuse_* (structure computed once, values recomputable);
    - CUDA >= 11.0: cusparseSpGEMM_* (work buffers kept for later recomputation);
    - older CUDA:  cusparseXcsrgemmNnz + cusparse_csr_spgemm.

  Also builds the host-side Mat_SeqAIJ scaffolding for C (i, j, ilen, imax, a) by copying
  the CSR structure back from the device, expanding compressed row offsets if needed.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *a,*b,*c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     i,j,m,n,k;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble               flops;
  PetscBool                    biscompressed,ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
  cusparseSpMatDescr_t         BmatSpDescr;
#else
  int                          cnz;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheckFalse(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  PetscCheckFalse(!flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  PetscCheckFalse(!flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ*)A->data;
  b = (Mat_SeqAIJ*)B->data;
  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  PetscCheckFalse(Acusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  PetscCheckFalse(Bcusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");

  /* exploit symmetry: At = A (or Bt = B) lets us avoid forming an explicit transpose;
     record the fact so the numeric phase applies the same rewrite */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
  }
  if (B->symmetric && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
  }
  /* pick result dimensions (m x n, inner dim k) and operand mult structs; since cuSPARSE
     cannot transpose on the fly, transposed operands get an explicit device transpose */
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  switch (ptype) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    k = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr);
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  ierr  = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  ierr  = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  c     = (Mat_SeqAIJ*)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
  Ccsr  = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
    /* C inherits A's nonzero row pattern: same nrows and rindex, mirrored on the device */
    c->compressedrow.nrows = a->compressedrow.nrows;
    ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
    ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
  Ccusp->mat      = Cmat;
  Ccusp->mat->mat = Ccsr;
  Ccsr->num_rows    = Ccusp->nrows;
  Ccsr->num_cols    = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
  stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
  /* device-resident scalar constants, used with CUSPARSE_POINTER_MODE_DEVICE in the spgemm calls */
  cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
    /* build an empty but valid CSR (all-zero row offsets) and skip the spgemm entirely */
    thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
    c->nz = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  PetscCheckFalse(!Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  PetscCheckFalse(!Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix*)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix */
    /* build a shallow CSR view of B with full (uncompressed) row offsets; column indices
       and values are shared with B's device storage, only row_offsets differ */
    CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
    Bcsr = new CsrMatrix;
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices;
    Bcsr->values         = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
      ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr = Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
                               Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                               Bcsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  PetscCheckFalse(!Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  PetscCheckFalse(!Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  /* precompute flops count */
  if (ptype == MATPRODUCT_AB) {
    /* 2 flops (multiply+add) per (A entry, matching B row entry) pair */
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i+1];
      for (j=st; j<en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2.*(b->i[brow+1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt anzi = a->i[i+1] - a->i[i];
      const PetscInt bnzi = b->i[i+1] - b->i[i];
      flops += (2.*anzi)*bnzi;
    }
  } else { /* TODO */
    flops = 0.;
  }

  mmdata->flops = flops;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  /* C's descriptor starts with nnz = 0 and NULL arrays; cuSPARSE fills in the size, and we
     set the real pointers via cusparseCsrSetPointers once nnz is known */
  stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
                          NULL, NULL, NULL,
                          CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                          CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
 {
  /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
     We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
  */
  void*  dBuffer1 = NULL;
  void*  dBuffer2 = NULL;
  void*  dBuffer3 = NULL;
  /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
  size_t bufferSize1 = 0;
  size_t bufferSize2 = 0;
  size_t bufferSize3 = 0;
  size_t bufferSize4 = 0;
  size_t bufferSize5 = 0;

  /*----------------------------------------------------------------------*/
  /* ask bufferSize1 bytes for external memory */
  stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                            CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                            &bufferSize1, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &dBuffer1, bufferSize1);CHKERRCUDA(cerr);
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                            CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                            &bufferSize1, dBuffer1);CHKERRCUSPARSE(stat);

  /*----------------------------------------------------------------------*/
  /* two-call idiom again: first query the three buffer sizes, then run nnz computation */
  stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                 &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &dBuffer2, bufferSize2);CHKERRCUDA(cerr);
  cerr = cudaMalloc((void**) &dBuffer3, bufferSize3);CHKERRCUDA(cerr);
  cerr = cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4);CHKERRCUDA(cerr);
  stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                 &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);CHKERRCUSPARSE(stat);
  /* buffers 1 and 2 are only needed up to this point */
  cerr = cudaFree(dBuffer1);CHKERRCUDA(cerr);
  cerr = cudaFree(dBuffer2);CHKERRCUDA(cerr);

  /*----------------------------------------------------------------------*/
  /* get matrix C non-zero entries C_nnz1 */
  stat  = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
  c->nz = (PetscInt) C_nnz1;
  /* allocate matrix C */
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values         = new THRUSTARRAY(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  /* update matC with the new pointers */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());CHKERRCUSPARSE(stat);

  /*----------------------------------------------------------------------*/
  stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                  &bufferSize5, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5);CHKERRCUDA(cerr);
  stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                  &bufferSize5, mmdata->dBuffer5);CHKERRCUSPARSE(stat);
  cerr = cudaFree(dBuffer3);CHKERRCUDA(cerr);
  /* fill in the values of C right away, so the numeric phase can skip work when reusesym is set */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                                     Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                     cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                     mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
  ierr = PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024);CHKERRQ(ierr);
 }
 #else
  size_t bufSize2;
  /* ask bufferSize bytes for external memory */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
  /* ask bufferSize again bytes for external memory */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
  /* The CUSPARSE documentation is not clear, nor the API
     We need both buffers to perform the operations properly!
     mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
     it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
     is stored in the descriptor! What a messy API... */
  cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
  /* compute the intermediate product of A * B */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  /* get matrix C non-zero entries C_nnz1 */
  stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
  c->nz = (PetscInt) C_nnz1;
  ierr = PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr);
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
 #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
#else
  /* legacy path: nnz counting uses a host-resident output (cnz), hence HOST pointer mode */
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
  stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
  c->nz = cnz;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
     I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
     D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalizesym:
  /* mirror the CSR structure onto the host (Mat_SeqAIJ i/j arrays) and fill in the
     bookkeeping fields a plain SeqAIJ matrix would get from preallocation/assembly */
  c->singlemalloc = PETSC_FALSE;
  c->free_a       = PETSC_TRUE;
  c->free_ij      = PETSC_TRUE;
  ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
  if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
    PetscInt *d_i = c->i;
    THRUSTINTARRAY ii(Ccsr->row_offsets->size());
    THRUSTINTARRAY jj(Ccsr->column_indices->size());
    ii   = *Ccsr->row_offsets;
    jj   = *Ccsr->column_indices;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  } else {
    PetscInt *d_i = c->i;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  if (ciscompressed) { /* need to expand host row offsets */
    /* rows not listed in rindex are empty: they repeat the previous compressed offset */
    PetscInt r = 0;
    c->i[0] = 0;
    for (k = 0; k < c->compressedrow.nrows; k++) {
      const PetscInt next = c->compressedrow.rindex[k];
      const PetscInt old = c->compressedrow.i[k];
      for (; r < next; r++) c->i[r+1] = old;
    }
    for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
  }
  ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
  c->maxnz = c->nz;
  c->nonzerorowcnt = 0;
  c->rmax = 0;
  /* derive per-row lengths and row statistics from the expanded row offsets */
  for (k = 0; k < m; k++) {
    const PetscInt nn = c->i[k+1] - c->i[k];
    c->ilen[k] = c->imax[k] = nn;
    c->nonzerorowcnt += (PetscInt)!!nn;
    c->rmax = PetscMax(c->rmax,nn);
  }
  ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
  /* host values array is allocated but not filled; values live on the GPU (see offloadmask below) */
  ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
  Ccsr->num_entries = c->nz;

  C->nonzerostate++;
  ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
  Ccusp->nonzerostate = C->nonzerostate;
  C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
  C->preallocated  = PETSC_TRUE;
  C->assembled     = PETSC_FALSE;
  C->was_assembled = PETSC_FALSE;
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
2882 
2883 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2884 
/* handles sparse or dense B */

/* Query the "use the CPU backend" option for one product type.

   All five product types expose the same switch; only the option key and the
   help/title strings differ, so the PetscOptionsBegin/Bool/End boilerplate is
   factored out here to avoid five copy-pasted blocks.

   Input:
     mat        - the product matrix
     api_user   - PETSC_TRUE when the user entered through the old-style API (MatMatMult() etc.)
     api_title  - options title and man-page name for the old-style API (e.g. "MatMatMult")
     api_opt    - old-style option key (e.g. "-matmatmult_backend_cpu")
     prod_title - options title for the MatProduct API (e.g. "MatProduct_AB")
   Output:
     usecpu     - set to PETSC_TRUE when the user requested the CPU backend */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE_CPUBackend(Mat mat,PetscBool api_user,const char api_title[],const char api_opt[],const char prod_title[],PetscBool *usecpu)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (api_user) {
    ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,api_title,"Mat");CHKERRQ(ierr);
    ierr = PetscOptionsBool(api_opt,"Use CPU code",api_title,*usecpu,usecpu,NULL);CHKERRQ(ierr);
    ierr = PetscOptionsEnd();CHKERRQ(ierr);
  } else {
    /* the MatProduct API uses one generic option key for all product types */
    ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,prod_title,"Mat");CHKERRQ(ierr);
    ierr = PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code",api_title,*usecpu,usecpu,NULL);CHKERRQ(ierr);
    ierr = PetscOptionsEnd();CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Select the symbolic-product implementation for a SEQAIJCUSPARSE A.

   Dispatch rules:
   - dense B: use the CUSPARSE x DENSECUDA kernels unless A is bound to the CPU;
   - sparse CUSPARSE B (and C for ABC): use the CUSPARSE x CUSPARSE kernels, unless the
     user asked for the CPU backend via the options database;
   - anything else: fall back to the plain SeqAIJ implementation. */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
  PetscErrorCode ierr;
  PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr);
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr);
  }
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr);
    }
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool usecpu = PETSC_FALSE;
    switch (product->type) {
    case MATPRODUCT_AB:
      ierr = MatProductSetFromOptions_SeqAIJCUSPARSE_CPUBackend(mat,product->api_user,"MatMatMult","-matmatmult_backend_cpu","MatProduct_AB",&usecpu);CHKERRQ(ierr);
      break;
    case MATPRODUCT_AtB:
      ierr = MatProductSetFromOptions_SeqAIJCUSPARSE_CPUBackend(mat,product->api_user,"MatTransposeMatMult","-mattransposematmult_backend_cpu","MatProduct_AtB",&usecpu);CHKERRQ(ierr);
      break;
    case MATPRODUCT_PtAP:
      ierr = MatProductSetFromOptions_SeqAIJCUSPARSE_CPUBackend(mat,product->api_user,"MatPtAP","-matptap_backend_cpu","MatProduct_PtAP",&usecpu);CHKERRQ(ierr);
      break;
    case MATPRODUCT_RARt:
      ierr = MatProductSetFromOptions_SeqAIJCUSPARSE_CPUBackend(mat,product->api_user,"MatRARt","-matrart_backend_cpu","MatProduct_RARt",&usecpu);CHKERRQ(ierr);
      break;
    case MATPRODUCT_ABC:
      ierr = MatProductSetFromOptions_SeqAIJCUSPARSE_CPUBackend(mat,product->api_user,"MatMatMatMult","-matmatmatmult_backend_cpu","MatProduct_ABC",&usecpu);CHKERRQ(ierr);
      break;
    default:
      break;
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (product->A->boundtocpu) { /* A's data lives on the CPU: use the CPU implementation */
        ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      /* no dedicated GPU kernels for these: compose them from pairwise products */
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3007 
/* yy = A*xx on the GPU (no transpose, no Hermitian); thin wrapper over the shared SpMV kernel */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3016 
/* zz = A*xx + yy on the GPU; thin wrapper over the shared SpMV kernel */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3025 
/* yy = A^H*xx (conjugate transpose) on the GPU; trans=TRUE, herm=TRUE in the shared kernel */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3034 
/* zz = A^H*xx + yy (conjugate transpose) on the GPU; trans=TRUE, herm=TRUE in the shared kernel */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3043 
/* yy = A^T*xx on the GPU; trans=TRUE, herm=FALSE in the shared kernel */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3052 
/* Device kernel: y[idx[i]] += x[i] for 0 <= i < n.
   Scatter-adds the compressed SpMV result (work vector x) into the full-length output y;
   idx holds the row indices of the nonzero rows (assumed distinct, so no atomics needed —
   matches how the caller fills cprowIndices; TODO confirm if reused elsewhere).
   A grid-stride loop makes the kernel correct for any 1-D launch configuration, and using
   PetscInt for the flattened thread id avoids the 32-bit overflow the previous
   'int i = blockIdx.x*blockDim.x + threadIdx.x' could suffer for very large n. */
__global__ static void ScatterAdd(PetscInt n, const PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  const PetscInt stride = (PetscInt)gridDim.x*blockDim.x;
  for (PetscInt i = (PetscInt)blockIdx.x*blockDim.x + threadIdx.x; i < n; i += stride) y[idx[i]] += x[i];
}
3058 
/* zz = op(A) xx + yy, the single kernel behind all the MatMult* wrappers above.
   op is selected by (trans,herm): !trans -> A; trans && !herm -> A^T; trans && herm -> A^H.
   yy may be NULL (no add) and yy == zz (in-place add) is supported.
   Handles "compressed row" storage (zero rows dropped) by staging through workVector and
   scattering the short result back into the full-length zz. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny; /* lengths of x and y in y = op(A) x; set in the CSR branches below */
#endif

  PetscFunctionBegin;
  PetscCheckFalse(herm && !trans,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
  if (!a->nonzerorowcnt) {
    /* empty matrix: op(A) xx == 0, so the result is just yy (or zero) */
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    PetscCheckFalse(!matstruct,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !A->form_explicit_transpose) {
      /* let cuSPARSE apply the (conjugate) transpose of the untransposed matrix */
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      /* use (and lazily build) an explicitly stored transpose, applied without op */
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get up-to-date zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
     #endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* for the transpose op the roles of num_rows/num_cols are swapped */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
     #endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* opA indexes the per-operation cuSpMV[] cache, hence the ABI guard */
      PetscCheckFalse(opA < 0 || opA > 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        cudaError_t cerr;
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                matstruct->matDescr,
                                matstruct->cuSpMV[opA].vecXDescr, beta,
                                matstruct->cuSpMV[opA].vecYDescr,
                                cusparse_scalartype,
                                cusparsestruct->spmvAlg,
                                &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
      }

      stat = cusparseSpMV(cusparsestruct->handle, opA,
                               matstruct->alpha_one,
                               matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
                               matstruct->cuSpMV[opA].vecXDescr,
                               beta,
                               matstruct->cuSpMV[opA].vecYDescr,
                               cusparse_scalartype,
                               cusparsestruct->spmvAlg,
                               matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
     #else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha_one, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
     #endif
    } else {
      if (cusparsestruct->nrows) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha_one, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
       #endif
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
         */
       #if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
       #else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
       #endif
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      /* transpose path: dptr was zarray and beta handled the add, except when yy aliases zz */
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  /* 2 flops (mul+add) per stored nonzero; without the vector add, each nonzero row saves one add */
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3250 
/* zz = A^T*xx + yy on the GPU; trans=TRUE, herm=FALSE in the shared kernel */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3259 
/* Finish assembly on the host side, then discard the cached raw device matrix if this
   assembly changed the nonzero pattern (the cached copy would be stale). */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp   = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscObjectState   state0  = A->nonzerostate; /* nonzero state prior to assembly */

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (cusp->deviceMat && A->nonzerostate != state0) {
    cudaError_t cerr;

    ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr);
    cerr = cudaFree(cusp->deviceMat);CHKERRCUDA(cerr);
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}
3277 
3278 /* --------------------------------------------------------------------------------*/
3279 /*@
3280    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
3281    (the default parallel PETSc format). This matrix will ultimately be pushed down
3282    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
3283    assembly performance the user should preallocate the matrix storage by setting
3284    the parameter nz (or the array nnz).  By setting these parameters accurately,
3285    performance during matrix assembly can be increased by more than a factor of 50.
3286 
3287    Collective
3288 
3289    Input Parameters:
3290 +  comm - MPI communicator, set to PETSC_COMM_SELF
3291 .  m - number of rows
3292 .  n - number of columns
3293 .  nz - number of nonzeros per row (same for all rows)
3294 -  nnz - array containing the number of nonzeros in the various rows
3295          (possibly different for each row) or NULL
3296 
3297    Output Parameter:
3298 .  A - the matrix
3299 
3300    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
3301    MatXXXXSetPreallocation() paradigm instead of this routine directly.
3302    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
3303 
3304    Notes:
3305    If nnz is given then nz is ignored
3306 
3307    The AIJ format (also called the Yale sparse matrix format or
3308    compressed row storage), is fully compatible with standard Fortran 77
3309    storage.  That is, the stored row and column indices can begin at
3310    either one (as in Fortran) or zero.  See the users' manual for details.
3311 
3312    Specify the preallocated storage with either nz or nnz (not both).
3313    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
3314    allocation.  For large problems you MUST preallocate memory or you
3315    will get TERRIBLE performance, see the users' manual chapter on matrices.
3316 
3317    By default, this format uses inodes (identical nodes) when possible, to
3318    improve numerical efficiency of matrix-vector products and solves. We
3319    search for consecutive rows with the same nonzero structure, thereby
3320    reusing matrix information to achieve increased efficiency.
3321 
3322    Level: intermediate
3323 
3324 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
3325 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); /* sequential matrix: local sizes equal global sizes */
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* cast drops const to match the internal preallocation signature; nnz is presumably only read — verify against MatSeqAIJSetPreallocation_SeqAIJ */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3337 
/* Destroy the CUSPARSE-specific data (GPU structures or triangular factors), detach all
   composed function slots, then hand off to the base SeqAIJ destructor. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode           ierr;
  size_t                   i;
  /* function slots composed on this object; reset in the same order they were removed before */
  static const char *const composed[] = {
    "MatSeqAIJCopySubArray_C",
    "MatCUSPARSESetFormat_C",
    "MatCUSPARSESetUseCPUSolve_C",
    "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",
    "MatProductSetFromOptions_seqaijcusparse_seqdense_C",
    "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",
    "MatFactorGetSolverType_C",
    "MatSetPreallocationCOO_C",
    "MatSetValuesCOO_C",
    "MatConvert_seqaijcusparse_hypre_C"
  };

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  for (i = 0; i < sizeof(composed)/sizeof(composed[0]); i++) {
    ierr = PetscObjectComposeFunction((PetscObject)A,composed[i],NULL);CHKERRQ(ierr);
  }
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3361 
3362 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
3363 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicate by copying the host (SeqAIJ) representation, then converting the copy in place
   to SEQAIJCUSPARSE so the duplicate gets its own GPU data structures. */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3373 
/* Y = Y + a*X on the GPU. Three paths depending on the nonzero-pattern relation 'str'
   (which may be upgraded to SAME_NONZERO_PATTERN by a device-side pattern comparison):
   - SAME_NONZERO_PATTERN:   one cublas axpy on the value arrays;
   - SUBSET_NONZERO_PATTERN: cusparse csrgeam with the output aliased to Y's arrays;
   - otherwise:              CPU fallback MatAXPY_SeqAIJ() (pattern of Y may change). */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry,*csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    /* X is not (or no longer) a GPU matrix: use the CPU path and drop Y's cached transpose */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
  PetscCheckFalse(cy->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  PetscCheckFalse(cx->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;
  /* see if we can turn this into a cublas axpy: equal nonzero counts and identical
     row offsets/column indices (compared on the device) mean identical patterns */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
    if (eq) {
      eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
    }
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    /* Y = 1.0*Y + a*X via csrgeam; the output arrays alias Y's arrays, valid because
       X's pattern is a subset of Y's so the result pattern equals Y's */
    cusparseStatus_t stat;
    PetscScalar      b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t           bufferSize;
    void             *buffer;
    cudaError_t      cerr;
#endif

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    /* alpha/beta (&a,&b) are host pointers here, so switch the pointer mode temporarily */
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
                                          &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                          &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                             cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    cerr = cudaFree(buffer);CHKERRCUDA(cerr);
#else
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
#endif
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical patterns: ay[] += a*ax[] over the raw value arrays */
    cublasHandle_t cublasv2handle;
    cublasStatus_t berr;
    PetscBLASInt   one = 1, bnz = 1;

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr);
    ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else {
    /* pattern of Y may change: fall back to the CPU implementation */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3471 
/* Y <- a*Y: scale every stored nonzero of Y in place on the GPU with one cublas scal call */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *y = (Mat_SeqAIJ*)Y->data;
  PetscScalar    *yvals;                   /* device pointer to Y's value array */
  cublasHandle_t handle;
  cublasStatus_t cberr;
  PetscBLASInt   ione = 1, nnz = 1;

  PetscFunctionBegin;
  ierr = PetscCUBLASGetHandle(&handle);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEGetArray(Y,&yvals);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(y->nz,&nnz);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  cberr = cublasXscal(handle,nnz,&a,yvals,ione);CHKERRCUBLAS(cberr);
  ierr = PetscLogGpuFlops(nnz);CHKERRQ(ierr); /* one multiply per nonzero */
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSERestoreArray(Y,&yvals);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3493 
/*
  Zero all stored values of A on both device and host. The device CSR copies
  (including the cached transpose, if any) are zeroed with thrust::fill; the
  host array is zeroed unconditionally, and the offload mask is set to BOTH
  only when a valid device copy was zeroed as well.
*/
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *aij   = (Mat_SeqAIJ*)A->data;
  PetscBool      ongpu  = PETSC_FALSE;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
    if (spptr->mat) {
      CsrMatrix *csr = (CsrMatrix*)spptr->mat->mat;
      if (csr->values) {
        ongpu = PETSC_TRUE; /* the device copy exists and is now zeroed too */
        thrust::fill(thrust::device,csr->values->begin(),csr->values->end(),0.);
      }
    }
    if (spptr->matTranspose) {
      CsrMatrix *csrT = (CsrMatrix*)spptr->matTranspose->mat;
      if (csrT->values) thrust::fill(thrust::device,csrT->values->begin(),csrT->values->end(),0.);
    }
  }
  ierr = PetscArrayzero(aij->a,aij->i[A->rmap->n]);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  A->offloadmask = ongpu ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
3524 
/*
  MatBindToCPU_SeqAIJCUSPARSE - bind (flg=PETSC_TRUE) or unbind (flg=PETSC_FALSE)
  the matrix to the CPU by swapping the Mat method table, the Mat_SeqAIJ method
  table, and the composed functions between the plain SeqAIJ and the CUSPARSE
  implementations. For factored matrices only the boundtocpu flag is recorded.
*/
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    A->boundtocpu = flg;
    PetscFunctionReturn(0);
  }
  if (flg) {
    /* download the values so the host copy is current before CPU kernels take over */
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* zero the Mat_SeqAIJ method table so the SeqAIJ defaults are used again */
    ierr = PetscMemzero(a->ops,sizeof(Mat_SeqAIJOps));CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  /* inode routines are enabled only when bound to the CPU and inode data exists */
  if (flg && a->inode.size) {
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3588 
/*
  MatConvert_SeqAIJ_SeqAIJCUSPARSE - convert a SeqAIJ matrix to SEQAIJCUSPARSE:
  duplicate or copy A according to reuse, allocate the GPU-side bookkeeping
  struct (Mat_SeqAIJCUSPARSE for regular matrices, Mat_SeqAIJCUSPARSETriFactors
  for factored ones) with its cuSPARSE handle on PETSc's default stream,
  install the CUSPARSE method tables, and compose the CUSPARSE-specific
  functions on the result.
*/
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  B = *newmat;

  /* vectors created from this matrix default to the CUDA vector type */
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;
      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRQ(ierr);
      spptr->format     = MAT_CUSPARSE_CSR;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
     #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
      spptr->spmvAlg    = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
     #else
      spptr->spmvAlg    = CUSPARSE_CSRMV_ALG1;    /* default, since we only support csr */
     #endif
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
     #endif
      B->spptr = spptr;
    } else {
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* install the GPU method tables and composed functions */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
#if defined(PETSC_HAVE_HYPRE)
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr);
#endif
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetUseCPUSolve_C",MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3650 
/* Constructor for MATSEQAIJCUSPARSE: create a SeqAIJ matrix, then convert it in place to the CUSPARSE type. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3660 
3661 /*MC
3662    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3663 
   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
3666    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3667 
3668    Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_use_cpu_solve - performs MatSolve on the CPU
3673 
3674   Level: beginner
3675 
3676 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3677 M*/
3678 
3679 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);
3680 
/*
  Register the CUSPARSE solver packages with PETSc's factor registry:
  the band LU solver for plain SEQAIJ, and LU/Cholesky/ILU/ICC for
  SEQAIJCUSPARSE matrices.
*/
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode      ierr;
  const MatFactorType ftypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  const PetscInt      nf       = (PetscInt)(sizeof(ftypes)/sizeof(ftypes[0]));

  PetscFunctionBegin;
  /* the band factorization is only provided for the plain SEQAIJ type */
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND,MATSEQAIJ,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr);
  for (PetscInt f = 0; f < nf; f++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftypes[f],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3694 
/*
  MatSeqAIJCUSPARSE_Destroy - free all resources held by a Mat_SeqAIJCUSPARSE
  struct (the device CSR copies, work vector, row-offset array, COO permutation
  arrays, csr2csc index array and the cuSPARSE handle) and the struct itself.
*/
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    delete (*cusparsestruct)->csr2csc_i;
    /* destroy the handle last, after everything that might have used it */
    if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3714 
/*
  Free the three device arrays of a CsrMatrix and the struct itself, then NULL
  the caller's pointer so a repeated call is a harmless no-op.
*/
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (!*mat) PetscFunctionReturn(0);
  delete (*mat)->values;
  delete (*mat)->column_indices;
  delete (*mat)->row_offsets;
  delete *mat;
  *mat = NULL;
  PetscFunctionReturn(0);
}
3727 
/*
  MatSeqAIJCUSPARSEMultStruct_Destroy (triangular-factor overload) - free one
  triangular-factor struct: its cuSPARSE matrix descriptor, the solve analysis
  info, the CSR storage, the device solve buffer, the pinned host copy AA_h,
  and (CUDA >= 11) the csr2csc conversion buffer, then the struct itself.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    if ((*trifactor)->solveBuffer)   {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
    /* AA_h is host memory allocated with cudaMallocHost, hence cudaFreeHost */
    if ((*trifactor)->AA_h)   {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
   #endif
    ierr = PetscFree(*trifactor);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3747 
/*
  MatSeqAIJCUSPARSEMultStruct_Destroy (mat-vec overload) - free a mat-vec
  struct: the matrix storage (CSR, or HYB/ELL before CUDA 11), the cuSPARSE
  matrix descriptor, the compressed-row index array, the device-resident
  scalar constants, and (CUDA >= 11) the cached SpMV descriptors and buffers,
  then the struct itself. NULLs the caller's pointer.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
       #endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr); /* fix: the error code was previously ignored */
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    /* alpha_one/beta_zero/beta_one are device-resident scalars used with CUSPARSE_POINTER_MODE_DEVICE */
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
3791 
/*
  MatSeqAIJCUSPARSETriFactors_Reset - free everything owned by the triangular
  factors container (the four factor structs, the row/column permutation index
  arrays, the work vector and the band factorization device buffers) while
  keeping the container and its cuSPARSE handle alive so it can be reused for
  a new factorization. Safe to call repeatedly.
*/
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = NULL;
    (*trifactors)->cpermIndices = NULL;
    (*trifactors)->workVector = NULL;
    /* fix: NULL the band buffers after freeing them, like the members above,
       so that a second Reset (or a later Destroy calling Reset) does not cudaFree a stale pointer */
    if ((*trifactors)->a_band_d)   {cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);(*trifactors)->a_band_d = NULL;}
    if ((*trifactors)->i_band_d)   {cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);(*trifactors)->i_band_d = NULL;}
    (*trifactors)->init_dev_prop = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3814 
/*
  Fully destroy the triangular factors container: reset its contents, destroy
  its cuSPARSE handle, and free the struct itself.
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr   = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    handle = (*trifactors)->handle; /* separate assignment avoids the assignment-inside-if idiom */
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3831 
/* Strict-weak lexicographic ordering on (row,col) tuples: by row, then by column. */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    const PetscInt r1 = t1.get<0>(), r2 = t2.get<0>();
    return (r1 < r2) || (r1 == r2 && t1.get<1>() < t2.get<1>());
  }
};
3842 
/* Equality of (row,col) tuples: both components must match. */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>();
  }
};
3852 
/* Binary op for adjacent_difference: 1 where consecutive values differ, 0 where equal. */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != t2) ? 1 : 0;
  }
};
3861 
/* Logical-or of two flags, returned as 0/1. */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != 0 || t2 != 0) ? 1 : 0;
  }
};
3870 
3871 #include <thrust/iterator/discard_iterator.h>
/*
  MatSetValuesCOO_SeqAIJCUSPARSE - set (INSERT_VALUES) or accumulate
  (ADD_VALUES) the matrix values from the array v[], given in the original COO
  order recorded by MatSetPreallocationCOO_SeqAIJCUSPARSE() in cusp->cooPerm
  (user order -> sorted order) and, when the COO pattern had duplicates, in
  cusp->cooPerm_a (sorted entry -> unique-nonzero index). v may live on the
  host or on the device; host data is staged through a temporary device array.
  A NULL v with INSERT_VALUES zeros the values. Leaves the matrix assembled
  with its up-to-date copy on the GPU.
*/
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY                           *cooPerm_v = NULL;  /* temporary device copy of v when v is host memory */
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                             *matrix;
  PetscErrorCode                        ierr;
  PetscInt                              n;

  PetscFunctionBegin;
  PetscCheckFalse(!cusp,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  PetscCheckFalse(!cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    /* no COO preallocation info: fall back to a plain assembly */
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  PetscCheckFalse(!matrix->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) {
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else {
    /* stage the host values through a temporary device array */
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v,v+n);
    d_v = cooPerm_v->data();
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
  }
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to add them up before accumulating */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
        cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonzeros in d_v[].
        cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
      */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* all nonzeros in d_v[] are unique entries */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]]  */
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else {
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals()); /* values[i] = d_v[cooPerm[i]] */
    }
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr);
  ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",a->rmax);CHKERRQ(ierr);
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
3953 
/*
  MatSeqAIJCUSPARSEInvalidateTranspose - mark the cached GPU transpose of A as
  stale; with destroy=PETSC_TRUE also free the transpose storage and the cached
  csr2csc index array.
*/
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (!cusp) PetscFunctionReturn(0); /* nothing cached yet */
  if (destroy) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
    delete cusp->csr2csc_i;
    cusp->csr2csc_i = NULL;
  }
  A->transupdated = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3970 
3971 #include <thrust/binary_search.h>
/*
  MatSetPreallocationCOO_SeqAIJCUSPARSE - build the CSR nonzero structure of A
  on the GPU from n COO entries (coo_i[], coo_j[]), which may contain repeated
  (i,j) pairs. The entries are sorted by (row,col) on the device; the
  permutation from user order to sorted order is stored in cusp->cooPerm and,
  when duplicates exist, the map from sorted entry to unique-nonzero index in
  cusp->cooPerm_a. Both are later consumed by MatSetValuesCOO_SeqAIJCUSPARSE().
  The host CSR arrays (a->i, a->j) are filled from the device results and the
  GPU mat-vec structures are allocated (with zero values) at the end.
*/
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0; /* nzr counts rows with at least one nonzero */
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    /* size changed: discard previously cached permutations */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);

    /* Ex.
      n = 6
      coo_i = [3,3,1,4,1,4]
      coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    *cusp->cooPerm_a = d_i; /* copy the sorted array */
    THRUSTINTARRAY w = d_j;

    /*
      d_i     = [1,1,3,3,4,4]
      d_j     = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i     = [1,3,3,4,4,x]
                            ^ekey
      d_j     = [2,2,3,5,6,x]
                           ^nekey
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());                                              /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
      (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a =          [0,0,1,1,1,1]*/
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin()); /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* rebuild the host CSR arrays from the device results */
    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);

  A->assembled = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
4089 
4090 /*@C
4091     MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices.
4092 
4093    Not collective
4094 
4095     Input Parameters:
4096 +   A - the matrix
4097 -   compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form
4098 
4099     Output Parameters:
4100 +   ia - the CSR row pointers
4101 -   ja - the CSR column indices
4102 
4103     Level: developer
4104 
4105     Notes:
4106       When compressed is true, the CSR structure does not contain empty rows
4107 
4108 .seealso: MatSeqAIJCUSPARSERestoreIJ(), MatSeqAIJCUSPARSEGetArrayRead()
4109 @*/
/*
  MatSeqAIJCUSPARSEGetIJ - return device pointers to the CSR row offsets and
  column indices of A (see the man page above). When compressed is false but
  the matrix stores compressed rows, a full-length row-offset array is built
  on the device (once) and cached in cusp->rowoffsets_gpu.
*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  if (!i || !j) PetscFunctionReturn(0); /* both outputs must be requested; otherwise do nothing */
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckFalse(cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  PetscCheckFalse(!cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      if (!cusp->rowoffsets_gpu) {
        /* upload the host row offsets once; reused by later calls */
        cusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
        ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(0);
}
4138 
4139 /*@C
4140     MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ()
4141 
4142    Not collective
4143 
    Input Parameters:
+   A - the matrix
-   compressed - PETSC_TRUE or PETSC_FALSE indicating whether the matrix data structure should always be returned in compressed form

    Output Parameters:
+   i - the CSR row pointers obtained with MatSeqAIJCUSPARSEGetIJ()
-   j - the CSR column indices obtained with MatSeqAIJCUSPARSEGetIJ()
4151 
4152     Level: developer
4153 
4154 .seealso: MatSeqAIJCUSPARSEGetIJ()
4155 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* no device resources to release; just invalidate the caller's pointers */
  if (j) *j = NULL;
  if (i) *i = NULL;
  PetscFunctionReturn(0);
}
4165 
4166 /*@C
4167    MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4168 
4169    Not Collective
4170 
4171    Input Parameter:
4172 .   A - a MATSEQAIJCUSPARSE matrix
4173 
4174    Output Parameter:
4175 .   a - pointer to the device data
4176 
4177    Level: developer
4178 
4179    Notes: may trigger host-device copies if up-to-date matrix data is on host
4180 
4181 .seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArrayRead()
4182 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csrmat;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscCheckFalse(cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* read access: bring host data to the device if it is newer, do not flag any modification */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  PetscCheckFalse(!cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)cusp->mat->mat;
  PetscCheckFalse(!csrmat->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csrmat->values->data().get();
  PetscFunctionReturn(0);
}
4201 
4202 /*@C
4203    MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead()
4204 
4205    Not Collective
4206 
4207    Input Parameter:
4208 .   A - a MATSEQAIJCUSPARSE matrix
4209 
4210    Output Parameter:
4211 .   a - pointer to the device data
4212 
4213    Level: developer
4214 
4215 .seealso: MatSeqAIJCUSPARSEGetArrayRead()
4216 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* read-only access: no state bump needed, just drop the caller's pointer */
  *a = NULL;
  PetscFunctionReturn(0);
}
4226 
4227 /*@C
4228    MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4229 
4230    Not Collective
4231 
4232    Input Parameter:
4233 .   A - a MATSEQAIJCUSPARSE matrix
4234 
4235    Output Parameter:
4236 .   a - pointer to the device data
4237 
4238    Level: developer
4239 
4240    Notes: may trigger host-device copies if up-to-date matrix data is on host
4241 
4242 .seealso: MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArray()
4243 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csrmat;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscCheckFalse(cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* read-write access: bring host data to the device first, then mark the GPU copy as the valid one */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  PetscCheckFalse(!cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)cusp->mat->mat;
  PetscCheckFalse(!csrmat->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csrmat->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  /* the caller may change values, so any cached transpose values become stale */
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4264 /*@C
4265    MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray()
4266 
4267    Not Collective
4268 
4269    Input Parameter:
4270 .   A - a MATSEQAIJCUSPARSE matrix
4271 
4272    Output Parameter:
4273 .   a - pointer to the device data
4274 
4275    Level: developer
4276 
4277 .seealso: MatSeqAIJCUSPARSEGetArray()
4278 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* values may have been modified through the returned pointer: bump the object state */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL;
  PetscFunctionReturn(0);
}
4291 
4292 /*@C
4293    MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4294 
4295    Not Collective
4296 
4297    Input Parameter:
4298 .   A - a MATSEQAIJCUSPARSE matrix
4299 
4300    Output Parameter:
4301 .   a - pointer to the device data
4302 
4303    Level: developer
4304 
4305    Notes: does not trigger host-device copies and flags data validity on the GPU
4306 
4307 .seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSERestoreArrayWrite()
4308 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csrmat;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscCheckFalse(cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* write-only access: unlike MatSeqAIJCUSPARSEGetArray(), no host-to-device copy is performed,
     so the device structure must already exist */
  PetscCheckFalse(!cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)cusp->mat->mat;
  PetscCheckFalse(!csrmat->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csrmat->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  /* the caller will overwrite values, so any cached transpose values become stale */
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4328 
4329 /*@C
4330    MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite()
4331 
4332    Not Collective
4333 
4334    Input Parameter:
4335 .   A - a MATSEQAIJCUSPARSE matrix
4336 
4337    Output Parameter:
4338 .   a - pointer to the device data
4339 
4340    Level: developer
4341 
4342 .seealso: MatSeqAIJCUSPARSEGetArrayWrite()
4343 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* values were (re)written on the device: bump the object state */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL;
  PetscFunctionReturn(0);
}
4356 
/* strict weak ordering on (row,col) for zip tuples (row, col, value, perm);
   the value and permutation slots do not participate in the comparison */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    const int r1 = t1.get<0>(), r2 = t2.get<0>();
    if (r1 != r2) return r1 < r2;       /* primary key: row index */
    return t1.get<1>() < t2.get<1>();   /* tie-break: column index */
  }
};
4367 
/* unary functor adding a fixed offset to an integer index (used to shift B's
   column/row indices when concatenating with A) */
struct Shift
{
  int _shift; /* constant offset applied to every input */

  Shift(int shift) : _shift(shift) {}
  __host__ __device__
  inline int operator() (const int &c) { return _shift + c; }
};
4379 
/* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in matlab notation.
   A and B must have the same number of rows; the result C is m x (cols(A)+cols(B)), with B's column
   indices shifted by cols(A). With MAT_INITIAL_MATRIX the full structure (and, when both inputs carry
   one, the explicit transpose) is built on the device; with MAT_REUSE_MATRIX only the numerical values
   of C are refreshed using the permutation (cooPerm) computed at creation. MAT_INPLACE_MATRIX is not
   supported. */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     Annz,Bnnz;
  cusparseStatus_t             stat;
  PetscInt                     i,m,n,zero = 0;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidHeaderSpecific(B,MAT_CLASSID,2);
  PetscValidPointer(C,4);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
  PetscCheckFalse(A->rmap->n != B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
  PetscCheckFalse(reuse == MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
  PetscCheckFalse(Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCheckFalse(Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    /* ---- phase 1: create C and its device-side CSR containers ---- */
    m     = A->rmap->n;
    n     = A->cmap->n + B->cmap->n;
    ierr  = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
    ierr  = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
    ierr  = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    c     = (Mat_SeqAIJ*)(*C)->data;
    Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
    Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr  = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows    = m;
    Ccusp->mat      = Cmat;
    Ccusp->mat->mat = Ccsr;
    Ccsr->num_rows  = m;
    Ccsr->num_cols  = n;
    stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
    /* device-resident scalar constants used by the cusparse SpMV/SpMM calls */
    cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
    PetscCheckFalse(!Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
    PetscCheckFalse(!Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr = (CsrMatrix*)Acusp->mat->mat;
    Bcsr = (CsrMatrix*)Bcusp->mat->mat;
    Annz = (PetscInt)Acsr->column_indices->size();
    Bnnz = (PetscInt)Bcsr->column_indices->size();
    c->nz = Annz + Bnnz;
    Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    Ccsr->num_entries = c->nz;
    /* cooPerm records where each (A-then-B) entry landed in C; reused by the MAT_REUSE_MATRIX path */
    Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      /* ---- phase 2: convert A and B to COO, merge by (row,col), convert back to CSR ---- */
      auto Acoo = new THRUSTINTARRAY32(Annz);
      auto Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff,*Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
          ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
          ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      /* expand CSR row offsets into explicit COO row indices */
      stat = cusparseXcsr2coo(Acusp->handle,
                              Aroff->data().get(),
                              Annz,
                              m,
                              Acoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle,
                              Broff->data().get(),
                              Bnnz,
                              m,
                              Bcoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      /* provenance tags carried through the merge: 1 marks entries from A, 0 from B */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
      /* B's columns are shifted by cols(A) on the fly via a transform iterator */
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
#endif
      auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
      auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
      auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
      auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
      auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
      auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
      auto p1 = Ccusp->cooPerm->begin();
      auto p2 = Ccusp->cooPerm->begin();
      thrust::advance(p2,Annz);
      /* merge the two sorted COO streams into C, keeping (row,col) order */
      PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
      /* undo the in-place shift applied to B's column indices above */
      thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
#endif
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
      /* split the destination positions by provenance: cooPerm[0..Annz) holds where A's entries went,
         cooPerm[Annz..) where B's entries went */
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      auto pred = thrust::identity<int>();
      PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
      PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
#endif
      /* compress the merged COO row indices back into CSR row offsets */
      stat = cusparseXcoo2csr(Ccusp->handle,
                              Ccoo->data().get(),
                              c->nz,
                              m,
                              Ccsr->row_offsets->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
                               Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        /* ---- phase 3 (optional): build C^T by stacking A^T on top of B^T ---- */
        ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
        ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr);
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix *CcsrT = new CsrMatrix;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated = PETSC_TRUE;
        Ccusp->rowoffsets_gpu = NULL;
        CmatT->cprowIndices = NULL;
        CmatT->mat = CcsrT;
        CcsrT->num_rows = n;
        CcsrT->num_cols = m;
        CcsrT->num_entries = c->nz;

        CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values = new THRUSTARRAY(c->nz);

        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        /* row offsets of C^T: A^T's offsets followed by B^T's offsets shifted by nnz(A)
           (the two offset arrays share the boundary entry, hence the advance(rT,-1)) */
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
          thrust::advance(rT,-1);
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
          thrust::copy(titb,tite,rT);
        }
        /* column indices and values of C^T are simply A^T's followed by B^T's */
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

        stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
                                 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
                                 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* ---- phase 4: mirror the CSR structure to the host so C is a valid SeqAIJ matrix ---- */
    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii   = *Ccsr->row_offsets;
      jj   = *Ccsr->column_indices;
      cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    } else {
      cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
    c->maxnz = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax = 0;
    /* per-row lengths / maxima, derived from the freshly copied row offsets */
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i+1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax,nn);
    }
    ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
    /* host values array allocated but not filled: the valid values live on the GPU (offloadmask below) */
    ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
    (*C)->nonzerostate++;
    ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
    ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* ---- MAT_REUSE_MATRIX: structure unchanged, only scatter the new values via cooPerm ---- */
    PetscCheckFalse((*C)->rmap->n != B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,(*C)->rmap->n,B->rmap->n);
    c = (Mat_SeqAIJ*)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
      PetscCheckFalse(!Ccusp->cooPerm,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
      PetscCheckFalse(Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
      PetscCheckFalse(Ccusp->nonzerostate != (*C)->nonzerostate,PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
      PetscCheckFalse(!Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      PetscCheckFalse(!Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix*)Acusp->mat->mat;
      Bcsr = (CsrMatrix*)Bcusp->mat->mat;
      Ccsr = (CsrMatrix*)Ccusp->mat->mat;
      PetscCheckFalse(Acsr->num_entries != (PetscInt)Acsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %" PetscInt_FMT " != %" PetscInt_FMT,Acsr->num_entries,(PetscInt)Acsr->values->size());
      PetscCheckFalse(Bcsr->num_entries != (PetscInt)Bcsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %" PetscInt_FMT " != %" PetscInt_FMT,Bcsr->num_entries,(PetscInt)Bcsr->values->size());
      PetscCheckFalse(Ccsr->num_entries != (PetscInt)Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT,Ccsr->num_entries,(PetscInt)Ccsr->values->size());
      PetscCheckFalse(Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries,PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT,Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
      PetscCheckFalse(Ccusp->cooPerm->size() != Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %" PetscInt_FMT " != %" PetscInt_FMT,(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid,Acsr->num_entries);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      /* scatter A's values into C through the first half of cooPerm */
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      thrust::for_each(zibait,zieait,VecCUDAEquals());
      /* scatter B's values into C through the second half of cooPerm */
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
      thrust::for_each(zibbit,ziebit,VecCUDAEquals());
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr);
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        /* refresh C^T values by restacking A^T and B^T values (structure already built) */
        PetscCheckFalse(!Ccusp->matTranspose,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    }
  }
  ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
4683 
/* Copies selected entries of the device value array of A into v.

   Input Parameters:
   A   - a MATSEQAIJCUSPARSE matrix
   n   - number of entries to copy
   idx - indices (into the aij value array) of the entries to gather; when NULL the
         first n entries are copied verbatim

   Output Parameter:
   v   - destination buffer; may live on the host or on the device (detected with isCudaMem()) */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  PetscErrorCode    ierr;
  bool              dmem;
  const PetscScalar *av;
  cudaError_t       cerr;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
  if (n && idx) {
    /* gather path: move the indices to the device, permute there, then (if needed) copy back */
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

    THRUSTARRAY *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      /* host destination: gather into a device workspace first */
      w = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (w) {
      cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    delete w; /* deleting NULL is a no-op for the device-destination case */
  } else {
    /* contiguous path: straight memcpy of the leading n entries */
    cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  /* fixed: a host destination receives data FROM the device, so log GPU->CPU traffic
     (was incorrectly PetscLogCpuToGpu) */
  if (!dmem) { ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr); }
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4723