xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision a2cee5feeb0a7056e6e3aa42b48b2df494100525)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
7 
8 #include <petscconf.h>
9 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
10 #include <../src/mat/impls/sbaij/seq/sbaij.h>
11 #include <../src/vec/vec/impls/dvecimpl.h>
12 #include <petsc/private/vecimpl.h>
13 #undef VecType
14 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
15 #include <thrust/adjacent_difference.h>
16 #include <thrust/async/for_each.h>
17 #include <thrust/iterator/constant_iterator.h>
18 #include <thrust/remove.h>
19 #include <thrust/sort.h>
20 #include <thrust/unique.h>
21 
22 const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
23 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
24   /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
25     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
26 
27   typedef enum {
28       CUSPARSE_MV_ALG_DEFAULT = 0,
29       CUSPARSE_COOMV_ALG      = 1,
30       CUSPARSE_CSRMV_ALG1     = 2,
31       CUSPARSE_CSRMV_ALG2     = 3
32   } cusparseSpMVAlg_t;
33 
34   typedef enum {
35       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
36       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
37       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
38       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
39       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
40       CUSPARSE_SPMM_ALG_DEFAULT = 0,
41       CUSPARSE_SPMM_COO_ALG1    = 1,
42       CUSPARSE_SPMM_COO_ALG2    = 2,
43       CUSPARSE_SPMM_COO_ALG3    = 3,
44       CUSPARSE_SPMM_COO_ALG4    = 5,
45       CUSPARSE_SPMM_CSR_ALG1    = 4,
46       CUSPARSE_SPMM_CSR_ALG2    = 6,
47   } cusparseSpMMAlg_t;
48 
49   typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
52   } cusparseCsr2CscAlg_t;
53   */
54   const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
55   const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
56   const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
57 #endif
58 
/* ---- Forward declarations: Cholesky/ICC factorization (symbolic and numeric) ---- */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

/* ---- Forward declarations: LU/ILU factorization (symbolic and numeric) ---- */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

/* ---- Forward declarations: triangular solves, options, and mat-vec operations ---- */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

/* ---- Forward declarations: destruction helpers (note: the two MatSeqAIJCUSPARSEMultStruct_Destroy
   declarations are C++ overloads distinguished by their argument types) ---- */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

/* ---- Forward declarations: host/device data movement ---- */
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);

/* ---- Forward declarations: COO assembly interface ---- */
PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
96 
97 PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
98 {
99   cusparseStatus_t   stat;
100   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
101 
102   PetscFunctionBegin;
103   if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
104   cusparsestruct->stream = stream;
105   stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
106   PetscFunctionReturn(0);
107 }
108 
109 PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
110 {
111   cusparseStatus_t   stat;
112   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
113 
114   PetscFunctionBegin;
115   if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
116   if (cusparsestruct->handle != handle) {
117     if (cusparsestruct->handle) {
118       stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
119     }
120     cusparsestruct->handle = handle;
121   }
122   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
123   PetscFunctionReturn(0);
124 }
125 
126 PetscErrorCode MatCUSPARSEClearHandle(Mat A)
127 {
128   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
129   PetscBool          flg;
130   PetscErrorCode     ierr;
131 
132   PetscFunctionBegin;
133   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
134   if (!flg || !cusparsestruct) PetscFunctionReturn(0);
135   if (cusparsestruct->handle) cusparsestruct->handle = 0;
136   PetscFunctionReturn(0);
137 }
138 
139 PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
140 {
141   PetscFunctionBegin;
142   *type = MATSOLVERCUSPARSE;
143   PetscFunctionReturn(0);
144 }
145 
146 /*MC
147   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
148   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
149   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
150   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
151   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
152   algorithms are not recommended. This class does NOT support direct solver operations.
153 
154   Level: beginner
155 
156 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
157 M*/
158 
/*
   MatGetFactor_seqaijcusparse_cusparse - Factory for MATSOLVERCUSPARSE: creates the (empty)
   factor matrix B for an LU/ILU/ILUDT or Cholesky/ICC factorization of A. Only function
   pointers and metadata are set here; the symbolic factorization allocates the storage later.
   GPU symbolic routines are installed unless A is bound to the CPU.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  /* the factor matrix is square with the same (local) dimension as A */
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  /* propagate A's CPU binding to the factor when requested */
  if (A->boundtocpu && A->bindingpropagates) { ierr = MatBindToCPU(*B,PETSC_TRUE);CHKERRQ(ierr); }
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    /* choose GPU vs CPU symbolic factorization depending on where A lives */
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    /* preferred orderings: nested dissection for full LU, natural for the incomplete variants */
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);CHKERRQ(ierr);
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]);CHKERRQ(ierr);
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* skip allocation: the symbolic factorization will size and allocate the factor */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  (*B)->canuseordering = PETSC_TRUE;
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
200 
201 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
202 {
203   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
204 
205   PetscFunctionBegin;
206   switch (op) {
207   case MAT_CUSPARSE_MULT:
208     cusparsestruct->format = format;
209     break;
210   case MAT_CUSPARSE_ALL:
211     cusparsestruct->format = format;
212     break;
213   default:
214     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
215   }
216   PetscFunctionReturn(0);
217 }
218 
219 /*@
220    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
221    operation. Only the MatMult operation can use different GPU storage formats
222    for MPIAIJCUSPARSE matrices.
223    Not Collective
224 
225    Input Parameters:
226 +  A - Matrix of type SEQAIJCUSPARSE
227 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
228 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
229 
230    Output Parameter:
231 
232    Level: intermediate
233 
234 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
235 @*/
236 PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
237 {
238   PetscErrorCode ierr;
239 
240   PetscFunctionBegin;
241   PetscValidHeaderSpecific(A, MAT_CLASSID,1);
242   ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
243   PetscFunctionReturn(0);
244 }
245 
246 PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A,PetscBool use_cpu)
247 {
248   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
249 
250   PetscFunctionBegin;
251   cusparsestruct->use_cpu_solve = use_cpu;
252   PetscFunctionReturn(0);
253 }
254 
255 /*@
256    MatCUSPARSESetUseCPUSolve - Sets use CPU MatSolve.
257 
258    Input Parameters:
259 +  A - Matrix of type SEQAIJCUSPARSE
260 -  use_cpu - set flag for using the built-in CPU MatSolve
261 
262    Output Parameter:
263 
264    Notes:
265    The cuSparse LU solver currently computes the factors with the built-in CPU method
266    and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
267    This method to specify if the solve is done on the CPU or GPU (GPU is the default).
268 
269    Level: intermediate
270 
271 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
272 @*/
273 PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A,PetscBool use_cpu)
274 {
275   PetscErrorCode ierr;
276 
277   PetscFunctionBegin;
278   PetscValidHeaderSpecific(A, MAT_CLASSID,1);
279   ierr = PetscTryMethod(A,"MatCUSPARSESetUseCPUSolve_C",(Mat,PetscBool),(A,use_cpu));CHKERRQ(ierr);
280   PetscFunctionReturn(0);
281 }
282 
283 PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
284 {
285   PetscErrorCode ierr;
286 
287   PetscFunctionBegin;
288   switch (op) {
289     case MAT_FORM_EXPLICIT_TRANSPOSE:
290       /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
291       if (A->form_explicit_transpose && !flg) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);}
292       A->form_explicit_transpose = flg;
293       break;
294     default:
295       ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
296       break;
297   }
298   PetscFunctionReturn(0);
299 }
300 
301 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
302 
303 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
304 {
305   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
306   IS             isrow = b->row,iscol = b->col;
307   PetscBool      row_identity,col_identity;
308   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)B->spptr;
309   PetscErrorCode ierr;
310 
311   PetscFunctionBegin;
312   ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
313   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
314   B->offloadmask = PETSC_OFFLOAD_CPU;
315   /* determine which version of MatSolve needs to be used. */
316   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
317   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
318   if (row_identity && col_identity) {
319     if (!cusparsestruct->use_cpu_solve) {
320       B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
321       B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
322     }
323     B->ops->matsolve = NULL;
324     B->ops->matsolvetranspose = NULL;
325   } else {
326     if (!cusparsestruct->use_cpu_solve) {
327       B->ops->solve = MatSolve_SeqAIJCUSPARSE;
328       B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
329     }
330     B->ops->matsolve = NULL;
331     B->ops->matsolvetranspose = NULL;
332   }
333 
334   /* get the triangular factors */
335   if (!cusparsestruct->use_cpu_solve) {
336     ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
337   }
338   PetscFunctionReturn(0);
339 }
340 
/*
   MatSetFromOptions_SeqAIJCUSPARSE - Processes -mat_cusparse_* command line options for a
   non-factored SEQAIJCUSPARSE matrix: the GPU storage format for SpMV and/or TriSolve,
   whether to solve on the CPU, and (CUDA >= 11) the cuSPARSE SpMV/SpMM/csr2csc algorithms.
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  /* these options only make sense for an un-factored matrix */
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
    ierr = PetscOptionsBool("-mat_cusparse_use_cpu_solve","Use CPU (I)LU solve","MatCUSPARSESetUseCPUSolve",cusparsestruct->use_cpu_solve,&cusparsestruct->use_cpu_solve,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetUseCPUSolve(A,cusparsestruct->use_cpu_solve);CHKERRQ(ierr);}
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
    if (flg && CUSPARSE_SPMV_CSR_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#else
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#endif
    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    /* same position-vs-value consistency check as for SpMV above */
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
   #endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
381 
382 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
383 {
384   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
385   PetscErrorCode               ierr;
386 
387   PetscFunctionBegin;
388   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
389   ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
390   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
391   PetscFunctionReturn(0);
392 }
393 
394 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
395 {
396   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
397   PetscErrorCode               ierr;
398 
399   PetscFunctionBegin;
400   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
401   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
402   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
403   PetscFunctionReturn(0);
404 }
405 
406 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
407 {
408   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
409   PetscErrorCode               ierr;
410 
411   PetscFunctionBegin;
412   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
413   ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
414   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
415   PetscFunctionReturn(0);
416 }
417 
418 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
419 {
420   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
421   PetscErrorCode               ierr;
422 
423   PetscFunctionBegin;
424   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
425   ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
426   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
427   PetscFunctionReturn(0);
428 }
429 
/*
   MatSeqAIJCUSPARSEBuildILULowerTriMatrix - Builds (or refreshes) the GPU copy of the unit
   lower-triangular factor L from the CPU-resident factored SeqAIJ data of A.

   On first call it constructs a CSR representation of L on the host in pinned memory --
   inserting an explicit 1.0 on each diagonal, since the SeqAIJ factored storage keeps only
   the strictly-lower entries of L -- uploads it into thrust device arrays, and runs the
   cuSPARSE triangular-solve analysis. On subsequent calls only the numerical values are
   refilled and re-uploaded. A no-op when n == 0 or the host data is not newer than the GPU's.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* NOTE(review): ai[n]-ai[1] counts the stored strictly-lower entries of rows 1..n-1
         (row 0 of L has no strictly-lower entries); the extra n is the unit diagonal */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {
        PetscScalar                       *AALo;

        /* pinned host buffers so the thrust assign()s below copy at full bandwidth */
        cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the lower triangular matrix */
        cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the lower triangular matrix */
        AiLo[0]  = (PetscInt) 0;
        AiLo[n]  = nzLower;
        AjLo[0]  = (PetscInt) 0;
        AALo[0]  = (MatScalar) 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        rowOffset= 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          /* copy the strictly-lower entries of row i, then append the unit diagonal */
          ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;

        /* upload the host CSR arrays into thrust device vectors */
        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* newer cuSPARSE needs an explicit work buffer sized by the library */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(),
                               #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 loTriFactor->solveInfo,
                                 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                               #else
                                 loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
                               #endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        /* keep AALo alive (as AA_h) for the value-only refresh path below; free the index buffers */
        loTriFactor->AA_h = AALo;
        cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0]  = 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
          offset      += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset      += 1;
          v  += nz;
        }
        /* sparsity pattern is unchanged; only push the new values to the device */
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
576 
577 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
578 {
579   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
580   PetscInt                          n = A->rmap->n;
581   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
582   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
583   cusparseStatus_t                  stat;
584   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
585   const MatScalar                   *aa = a->a,*v;
586   PetscInt                          *AiUp, *AjUp;
587   PetscInt                          i,nz, nzUpper, offset;
588   PetscErrorCode                    ierr;
589   cudaError_t                       cerr;
590 
591   PetscFunctionBegin;
592   if (!n) PetscFunctionReturn(0);
593   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
594     try {
595       /* next, figure out the number of nonzeros in the upper triangular matrix. */
596       nzUpper = adiag[0]-adiag[n];
597       if (!upTriFactor) {
598         PetscScalar *AAUp;
599 
600         cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
601 
602         /* Allocate Space for the upper triangular matrix */
603         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
604         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
605 
606         /* Fill the upper triangular matrix */
607         AiUp[0]=(PetscInt) 0;
608         AiUp[n]=nzUpper;
609         offset = nzUpper;
610         for (i=n-1; i>=0; i--) {
611           v  = aa + adiag[i+1] + 1;
612           vi = aj + adiag[i+1] + 1;
613 
614           /* number of elements NOT on the diagonal */
615           nz = adiag[i] - adiag[i+1]-1;
616 
617           /* decrement the offset */
618           offset -= (nz+1);
619 
620           /* first, set the diagonal elements */
621           AjUp[offset] = (PetscInt) i;
622           AAUp[offset] = (MatScalar)1./v[nz];
623           AiUp[i]      = AiUp[i+1] - (nz+1);
624 
625           ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
626           ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
627         }
628 
629         /* allocate space for the triangular factor information */
630         ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
631         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
632 
633         /* Create the matrix description */
634         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
635         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
636        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
637         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
638        #else
639         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
640        #endif
641         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
642         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
643 
644         /* set the operation */
645         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
646 
647         /* set the matrix */
648         upTriFactor->csrMat = new CsrMatrix;
649         upTriFactor->csrMat->num_rows = n;
650         upTriFactor->csrMat->num_cols = n;
651         upTriFactor->csrMat->num_entries = nzUpper;
652 
653         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
654         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
655 
656         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
657         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
658 
659         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
660         upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
661 
662         /* Create the solve analysis information */
663         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
664         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
665       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
666         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
667                                      upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
668                                      upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
669                                      upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
670                                      &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
671         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
672       #endif
673 
674         /* perform the solve analysis */
675         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
676                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
677                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
678                                  upTriFactor->csrMat->column_indices->data().get(),
679                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
680                                  upTriFactor->solveInfo,
681                                  upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
682                                #else
683                                  upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
684                                #endif
685         cerr = WaitForCUDA();CHKERRCUDA(cerr);
686         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
687 
688         /* assign the pointer */
689         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
690         upTriFactor->AA_h = AAUp;
691         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
692         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
693         ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
694       } else {
695         if (!upTriFactor->AA_h) {
696           cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
697         }
698         /* Fill the upper triangular matrix */
699         offset = nzUpper;
700         for (i=n-1; i>=0; i--) {
701           v  = aa + adiag[i+1] + 1;
702 
703           /* number of elements NOT on the diagonal */
704           nz = adiag[i] - adiag[i+1]-1;
705 
706           /* decrement the offset */
707           offset -= (nz+1);
708 
709           /* first, set the diagonal elements */
710           upTriFactor->AA_h[offset] = 1./v[nz];
711           ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
712         }
713         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
714         ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
715       }
716     } catch(char *ex) {
717       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
718     }
719   }
720   PetscFunctionReturn(0);
721 }
722 
/*
   Pushes the ILU triangular factors of A to the GPU and uploads the row and
   column permutations (when they are not the identity) needed by the
   permuted MatSolve paths.  Permutation uploads happen at most once: they
   are skipped when rpermIndices/cpermIndices already exist.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           rowIS    = aij->row;
  IS                           colIS    = aij->icol;
  PetscBool                    rowIsIdentity,colIsIdentity;
  PetscInt                     nrows    = A->rmap->n;

  PetscFunctionBegin;
  if (!factors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");

  /* build (or refresh) the lower and upper triangular factors on the GPU */
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  if (!factors->workVector) factors->workVector = new THRUSTARRAY(nrows);
  factors->nnz   = aij->nz;
  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* upload the row permutation once, and only when it is not the identity */
  ierr = ISIdentity(rowIS,&rowIsIdentity);CHKERRQ(ierr);
  if (!rowIsIdentity && !factors->rpermIndices) {
    const PetscInt *ridx;

    ierr = ISGetIndices(rowIS,&ridx);CHKERRQ(ierr);
    factors->rpermIndices = new THRUSTINTARRAY(nrows);
    factors->rpermIndices->assign(ridx,ridx+nrows);
    ierr = ISRestoreIndices(rowIS,&ridx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(nrows*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* likewise for the column permutation */
  ierr = ISIdentity(colIS,&colIsIdentity);CHKERRQ(ierr);
  if (!colIsIdentity && !factors->cpermIndices) {
    const PetscInt *cidx;

    ierr = ISGetIndices(colIS,&cidx);CHKERRQ(ierr);
    factors->cpermIndices = new THRUSTINTARRAY(nrows);
    factors->cpermIndices->assign(cidx,cidx+nrows);
    ierr = ISRestoreIndices(colIS,&cidx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(nrows*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
766 
/*
   Build the GPU (cuSPARSE) triangular factors used by the ICC/Cholesky
   solves and run the cuSPARSE triangular-solve analysis on them.

   Only the upper triangle U is stored on the GPU.  The lower triangular
   solve reuses the SAME CSR pattern with CUSPARSE_OPERATION_TRANSPOSE and a
   rescaled copy of the values (AALo); this is why the "lo" factor is also
   created with CUSPARSE_FILL_MODE_UPPER.

   A->data is read both through the AIJ header (a->nz) and through the SBAIJ
   layout (b->i, b->j, b->a).  NOTE(review): this assumes the ICC factor is
   stored in SBAIJ (upper-triangle-only) form with the diagonal entry last
   in each row -- confirm against the host factorization routine.

   First call: builds the CSR structure, uploads pattern + values and runs
   the solve analysis.  Subsequent calls (structure already on the GPU):
   only the numerical values are recomputed and re-uploaded.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers for the upper and (rescaled) lower factor values */
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements: the row's diagonal value sits
             at v[nz]; both factors carry its reciprocal */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            /* negate off-diagonals for U; additionally scale by 1/diag for the lower factor */
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(),
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 upTriFactor->solveInfo,
                                 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                                #else
                                 upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
                                #endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        /* the lower solve runs on the same upper-triangular pattern with a
           transposed operation, hence FILL_MODE_UPPER here as well */
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(),
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 loTriFactor->solveInfo,
                                 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                                #else
                                 loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
                                #endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      } else {
        /* structure already on the GPU: validate both factors up front so we
           neither dereference NULL nor waste work filling the buffers */
        if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing upTriFactor");
        if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing loTriFactor");

        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        /* pattern is unchanged: only the values need to be re-uploaded */
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
986 
/*
   Pushes the ICC/Cholesky triangular factors of A to the GPU and, for a
   non-identity ordering, uploads the permutation and its inverse for the
   permuted MatSolve path.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           perm     = aij->row;
  PetscBool                    permIsIdentity;
  PetscInt                     nrows    = A->rmap->n;

  PetscFunctionBegin;
  if (!factors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!factors->workVector) factors->workVector = new THRUSTARRAY(nrows);
  /* off-diagonal entries of the symmetric factor count once for each triangle */
  factors->nnz   = (aij->nz-nrows)*2 + nrows;
  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* upload the ordering (rows) and its inverse (columns) when non-trivial */
  ierr = ISIdentity(perm,&permIsIdentity);CHKERRQ(ierr);
  if (!permIsIdentity) {
    IS             invperm;
    const PetscInt *pidx,*ipidx;

    ierr = ISInvertPermutation(perm,PETSC_DECIDE,&invperm);CHKERRQ(ierr);
    ierr = ISGetIndices(invperm,&ipidx);CHKERRQ(ierr);
    ierr = ISGetIndices(perm,&pidx);CHKERRQ(ierr);
    factors->rpermIndices = new THRUSTINTARRAY(nrows);
    factors->rpermIndices->assign(pidx,pidx+nrows);
    factors->cpermIndices = new THRUSTINTARRAY(nrows);
    factors->cpermIndices->assign(ipidx,ipidx+nrows);
    ierr = ISRestoreIndices(invperm,&ipidx);CHKERRQ(ierr);
    ierr = ISDestroy(&invperm);CHKERRQ(ierr);
    ierr = ISRestoreIndices(perm,&pidx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*nrows*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1024 
/*
   Numeric Cholesky factorization: the factorization itself runs on the CPU
   (via MatCholeskyFactorNumeric_SeqAIJ); afterwards the triangular factors
   are copied to the GPU and the solve callbacks are selected according to
   whether the ordering is the natural one.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *baij = (Mat_SeqAIJ*)B->data;
  IS             perm  = baij->row;
  PetscBool      natural;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* factor on the host; make sure the host copy of A is current first */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(perm,&natural);CHKERRQ(ierr);
  B->ops->solve             = natural ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering : MatSolve_SeqAIJCUSPARSE;
  B->ops->solvetranspose    = natural ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering : MatSolveTranspose_SeqAIJCUSPARSE;
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors onto the GPU and run the solve analysis */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1054 
1055 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
1056 {
1057   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1058   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1059   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1060   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
1061   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1062   cusparseStatus_t                  stat;
1063   cusparseIndexBase_t               indexBase;
1064   cusparseMatrixType_t              matrixType;
1065   cusparseFillMode_t                fillMode;
1066   cusparseDiagType_t                diagType;
1067   cudaError_t                       cerr;
1068   PetscErrorCode                    ierr;
1069 
1070   PetscFunctionBegin;
1071   /* allocate space for the transpose of the lower triangular factor */
1072   ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
1073   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1074 
1075   /* set the matrix descriptors of the lower triangular factor */
1076   matrixType = cusparseGetMatType(loTriFactor->descr);
1077   indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
1078   fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1079     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1080   diagType = cusparseGetMatDiagType(loTriFactor->descr);
1081 
1082   /* Create the matrix description */
1083   stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
1084   stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
1085   stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
1086   stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
1087   stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
1088 
1089   /* set the operation */
1090   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1091 
1092   /* allocate GPU space for the CSC of the lower triangular factor*/
1093   loTriFactorT->csrMat = new CsrMatrix;
1094   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
1095   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
1096   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
1097   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
1098   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1099   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
1100 
1101   /* compute the transpose of the lower triangular factor, i.e. the CSC */
1102 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1103   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1104                                        loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1105                                        loTriFactor->csrMat->values->data().get(),
1106                                        loTriFactor->csrMat->row_offsets->data().get(),
1107                                        loTriFactor->csrMat->column_indices->data().get(),
1108                                        loTriFactorT->csrMat->values->data().get(),
1109                                        loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1110                                        CUSPARSE_ACTION_NUMERIC,indexBase,
1111                                        CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1112   cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1113 #endif
1114 
1115   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1116   stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1117                           loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1118                           loTriFactor->csrMat->values->data().get(),
1119                           loTriFactor->csrMat->row_offsets->data().get(),
1120                           loTriFactor->csrMat->column_indices->data().get(),
1121                           loTriFactorT->csrMat->values->data().get(),
1122                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1123                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1124                           CUSPARSE_ACTION_NUMERIC, indexBase,
1125                           CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat);
1126                         #else
1127                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1128                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1129                         #endif
1130   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1131   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1132 
1133   /* Create the solve analysis information */
1134   ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1135   stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1136 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1137   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
1138                                 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1139                                 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1140                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
1141                                 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1142   cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1143 #endif
1144 
1145   /* perform the solve analysis */
1146   stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
1147                            loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1148                            loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1149                            loTriFactorT->csrMat->column_indices->data().get(),
1150                           #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1151                            loTriFactorT->solveInfo,
1152                            loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
1153                           #else
1154                            loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1155                           #endif
1156   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1157   ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1158 
1159   /* assign the pointer */
1160   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1161 
1162   /*********************************************/
1163   /* Now the Transpose of the Upper Tri Factor */
1164   /*********************************************/
1165 
1166   /* allocate space for the transpose of the upper triangular factor */
1167   ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
1168   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1169 
1170   /* set the matrix descriptors of the upper triangular factor */
1171   matrixType = cusparseGetMatType(upTriFactor->descr);
1172   indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
1173   fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1174     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1175   diagType = cusparseGetMatDiagType(upTriFactor->descr);
1176 
1177   /* Create the matrix description */
1178   stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
1179   stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
1180   stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
1181   stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
1182   stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
1183 
1184   /* set the operation */
1185   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1186 
1187   /* allocate GPU space for the CSC of the upper triangular factor*/
1188   upTriFactorT->csrMat = new CsrMatrix;
1189   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1190   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1191   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1192   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
1193   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1194   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1195 
1196   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1197 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1198   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
1199                                 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1200                                 upTriFactor->csrMat->values->data().get(),
1201                                 upTriFactor->csrMat->row_offsets->data().get(),
1202                                 upTriFactor->csrMat->column_indices->data().get(),
1203                                 upTriFactorT->csrMat->values->data().get(),
1204                                 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1205                                 CUSPARSE_ACTION_NUMERIC,indexBase,
1206                                 CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1207   cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1208 #endif
1209 
1210   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1211   stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
1212                           upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1213                           upTriFactor->csrMat->values->data().get(),
1214                           upTriFactor->csrMat->row_offsets->data().get(),
1215                           upTriFactor->csrMat->column_indices->data().get(),
1216                           upTriFactorT->csrMat->values->data().get(),
1217                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1218                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1219                           CUSPARSE_ACTION_NUMERIC, indexBase,
1220                           CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat);
1221                         #else
1222                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1223                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1224                         #endif
1225 
1226   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1227   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1228 
1229   /* Create the solve analysis information */
1230   ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1231   stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1232   #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1233   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
1234                                  upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1235                                  upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1236                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
1237                                  &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1238   cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1239   #endif
1240 
1241   /* perform the solve analysis */
1242   stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
1243                            upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1244                            upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1245                            upTriFactorT->csrMat->column_indices->data().get(),
1246                           #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1247                            upTriFactorT->solveInfo,
1248                            upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
1249                           #else
1250                            upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1251                           #endif
1252 
1253   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1254   ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1255 
1256   /* assign the pointer */
1257   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1258   PetscFunctionReturn(0);
1259 }
1260 
/* Unary functor for thrust::transform: converts a PetscScalar that holds an
   integer value (e.g. the permutation produced by running csr2csc on the
   identity sequence 0,1,2,...) back to a PetscInt, discarding any imaginary
   part in complex builds. operator() is const so the functor can be invoked
   on const or by-value copies inside thrust algorithms. */
struct PetscScalarToPetscInt
{
  __host__ __device__
  PetscInt operator()(PetscScalar s) const
  {
    return (PetscInt)PetscRealPart(s);
  }
};
1269 
/*
   MatSeqAIJCUSPARSEFormExplicitTranspose - builds (or refreshes) an explicit
   device-side copy of A^T and caches it in cusparsestruct->matTranspose.

   No-op when A->transupdated is already set.  On first use the transposed
   storage is allocated; for the CSR format a value-permutation index array
   (csr2csc_i) is also computed once, so that subsequent refreshes only need a
   cheap thrust gather of the values instead of a full csr2csc conversion.
   The pre-CUDA-11 ELL/HYB path instead converts HYB -> CSR -> CSC -> HYB.
*/
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* make sure A's CSR data is current on the device before transposing it */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(0); /* cached transpose is still valid */
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
    /* non-CSR formats cannot be updated in place; drop any stale transpose */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
  }
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

    /* set alpha and beta: device-resident scalar constants used when multiplying with A^T */
    err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* allocate CSR storage for A^T: row/col dimensions swapped, same nnz */
      CsrMatrix *matrixT = new CsrMatrix;
      matstructT->mat = matrixT;
      matrixT->num_rows = A->cmap->n;
      matrixT->num_cols = A->rmap->n;
      matrixT->num_entries = a->nz;
      matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values = new THRUSTARRAY(a->nz);

      /* mirror A's host row offsets (a->i) on the device; used by csr2csc below */
      if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      #if PETSC_PKG_CUDA_VERSION_GE(11,2,1)
        /* generic-API descriptor for A^T, used by cusparseSpMV/SpMM */
        stat = cusparseCreateCsr(&matstructT->matDescr,
                               matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                               matrixT->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
      #else
        /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
           see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

           I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
           it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
           when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
        */
        if (matrixT->num_entries) {
          stat = cusparseCreateCsr(&matstructT->matDescr,
                                 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                                 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                                 matrixT->values->data().get(),
                                 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I,
                                 indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);

        } else {
          matstructT->matDescr = NULL;
          matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
        }
      #endif
     #endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
   #else
      /* legacy (pre-CUDA-11) path: transpose via HYB -> CSR -> CSC -> HYB */
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows = A->rmap->n;
      temp->num_cols = A->cmap->n;
      temp->num_entries = a->nz;
      temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle,
                              matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get());CHKERRCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      /* NOTE(review): tempT->num_rows/num_cols mirror A's (rmap,cmap) rather than
         the swapped (cmap,rmap) one would expect for a transpose; this only
         matters for rectangular matrices -- confirm before relying on it. */
      tempT->num_rows = A->rmap->n;
      tempT->num_cols = A->cmap->n;
      tempT->num_entries = a->nz;
      tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                              temp->num_cols, temp->num_entries,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get(),
                              tempT->values->data().get(),
                              tempT->column_indices->data().get(),
                              tempT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

      /* Last, convert CSC to HYB */
      cusparseHybMat_t hybMat;
      stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
      cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
        CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                              matstructT->descr, tempT->values->data().get(),
                              tempT->row_offsets->data().get(),
                              tempT->column_indices->data().get(),
                              hybMat, 0, partition);CHKERRCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY*) tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
        delete (CsrMatrix*) tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY*) temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
        delete (CsrMatrix*) temp;
      }
     #endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
    if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix");
    if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows");
    if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols");
    if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values");
    if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT");
    if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows");
    if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols");
    if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
      ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    if (!cusparsestruct->csr2csc_i) {
      /* one-time setup: run csr2csc on the identity sequence 0,1,2,... stored as
         scalars; the transposed "values" then hold, for each entry of A^T, the
         position of the corresponding entry in A, giving a reusable gather map */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* CUDA-11 csr2cscEx2 needs an explicit, separately sized work buffer */
      void   *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                           A->cmap->n, matrix->num_entries,
                                           matrix->values->data().get(),
                                           cusparsestruct->rowoffsets_gpu->data().get(),
                                           matrix->column_indices->data().get(),
                                           matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                           CUSPARSE_ACTION_NUMERIC,indexBase,
                                           cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat);
      err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err);
     #endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                              A->cmap->n,matrix->num_entries,
                              csr2csc_a.data().get(),
                              cusparsestruct->rowoffsets_gpu->data().get(),
                              matrix->column_indices->data().get(),
                              matrixT->values->data().get(),
                             #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                              matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                              CUSPARSE_ACTION_NUMERIC,indexBase,
                              cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat);
                             #else
                              matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
                             #endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
      }

      /* convert the scalar-encoded positions into the integer gather map */
      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      err = cudaFree(csr2cscBuffer);CHKERRCUDA(err);
     #endif
    }
    /* refresh A^T's values by gathering A's values through the cached csr2csc_i map */
    PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
                                                     thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
                                                     matrixT->values->begin()));
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
1499 
1500 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
   MatSolveTranspose_SeqAIJCUSPARSE - solves A^T x = b on the GPU using the
   cached transposed triangular factors, with the factorization's row/column
   permutations applied around the two triangular solves.  Since A = LU,
   A^T = U^T L^T, so the U-transpose system is solved first, then L-transpose.
   The transposed factor analysis is created lazily on first call.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: gather b through rpermIndices into x */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve U^T (held in upTriFactorT), x in place -> intermediate in tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Then, solve L^T (held in loTriFactorT): tempGPU -> xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                         xarray);CHKERRCUSPARSE(stat);
                      #endif

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves ~ 2*nnz flops, minus the n divisions counted once */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1588 
/*
   MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering - solves A^T x = b on the
   GPU for a factorization done in natural ordering, so no row/column
   permutations are needed: solve U^T first (A^T = U^T L^T), then L^T.
   The transposed factor analysis is created lazily on first call.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U^T: b -> intermediate in tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Then, solve L^T: tempGPU -> xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        xarray);CHKERRCUSPARSE(stat);
                      #endif

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves ~ 2*nnz flops, minus the n divisions counted once */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1658 
/*
   MatSolve_SeqAIJCUSPARSE - solves A x = b on the GPU using the cached LU
   triangular factors: gather b through the row permutation, forward-solve L,
   back-solve U, then scatter through the column permutation into x.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: gather b through rpermIndices into tempGPU */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L (forward substitution): tempGPU -> xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                         xarray,
                         loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                         xarray);CHKERRCUSPARSE(stat);
                      #endif

  /* Then, solve U (back substitution): xarray -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,xarray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Last, reorder with the column permutation: gather tempGPU through cpermIndices into x */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves ~ 2*nnz flops, minus the n divisions counted once */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1734 
/* Solve A x = b on the GPU using the cached triangular factors, for a factorization
   computed in natural ordering: no row/column permutations are applied, so the
   right-hand side is fed directly to the lower solve and the work vector tempGPU
   only carries the intermediate result between the two triangular solves. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers (x is write-only, b is read-only) */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L: temp <- L^{-1} b.  The CUDA>=9 API additionally takes the
     number of entries and a solve policy plus a user-provided buffer. */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        loTriFactor->solvePolicy,loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Next, solve U: x <- U^{-1} temp */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        xarray);CHKERRCUSPARSE(stat);
                      #endif

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* Forward + backward substitution: 2*nnz - n flops */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1796 
/* Bring the matrix values back from the device to the host value array.
   Only the numerical values are transferred: the sparsity pattern on the
   host is already valid.  After the copy both sides are in sync. */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *aij  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cudaError_t        err;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  /* Nothing to do unless the current values live exclusively on the device */
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    CsrMatrix *csr = (CsrMatrix*)cusp->mat->mat;

    ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    err  = cudaMemcpy(aij->a, csr->values->data().get(), aij->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    err  = WaitForCUDA();CHKERRCUDA(err);
    ierr = PetscLogGpuToCpu(aij->nz*sizeof(PetscScalar));CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    A->offloadmask = PETSC_OFFLOAD_BOTH; /* host and device now agree */
  }
  PetscFunctionReturn(0);
}
1817 
/* Expose the host value array for read/write access, first making sure
   the host copy is current with respect to the device. */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* sync host values from the GPU if they are stale */
  ierr   = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  *array = aij->a;
  PetscFunctionReturn(0);
}
1827 
/* Give back the array obtained with MatSeqAIJGetArray_SeqAIJCUSPARSE.
   The caller may have modified host values, so mark the device copy stale. */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  *array         = NULL;              /* guard against use after restore */
  A->offloadmask = PETSC_OFFLOAD_CPU; /* host is now the authoritative copy */
  PetscFunctionReturn(0);
}
1835 
/* Expose the host value array for read-only access, syncing from the
   device first when the host copy is out of date. */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* ensure host values are current before handing them out */
  ierr   = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  *array = aij->a;
  PetscFunctionReturn(0);
}
1845 
/* Give back a read-only array; values were not modified, so the offload
   mask is deliberately left untouched. */
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL; /* guard against use after restore */
  PetscFunctionReturn(0);
}
1852 
/* Expose the host value array for write-only access.  No device-to-host
   sync is performed since the caller promises to overwrite the values. */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  *array = aij->a;
  PetscFunctionReturn(0);
}
1859 
/* Give back a write-only array: the host values were (re)written, so the
   device copy must be treated as stale. */
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  *array         = NULL;              /* guard against use after restore */
  A->offloadmask = PETSC_OFFLOAD_CPU; /* host is now the authoritative copy */
  PetscFunctionReturn(0);
}
1867 
/* Copy the AIJ matrix from the host to the GPU.  If the nonzero pattern is
   unchanged (same nonzerostate) and the storage format is CSR, only the value
   array is re-uploaded; otherwise the cuSPARSE structures are destroyed and
   rebuilt from the host CSR (or compressed-row) data. */
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
  /* only act when the device copy is missing or out of date */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      matrix->values->assign(a->a, a->a+a->nz);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* values changed but pattern is intact: invalidate only the transpose values */
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
    } else {
      PetscInt nnz;
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* pattern changed (or non-CSR format): tear down all cached device structures */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        if (a->compressedrow.use) {
          /* compressed-row storage: only rows that contain nonzeros are stored;
             ridx maps the compressed rows back to global row indices */
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
        if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");
        /* no host values: upload the pattern only, and do not mark OFFLOAD_BOTH */
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
        else nnz = a->nz;

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident scalar constants, used with the handle in device pointer mode below */
        err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          /* device indices are 32-bit (THRUSTINTARRAY32) regardless of PetscInt width */
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                    mat->num_rows, mat->num_cols, mat->num_entries,
                                    mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                    mat->values->data().get(),
                                    CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                    CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
          }
         #endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
         #else
          /* build a temporary device CSR, convert it to HYB/ELL, then discard the CSR */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
              matstruct->descr, mat->values->data().get(),
              mat->row_offsets->data().get(),
              mat->column_indices->data().get(),
              hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          /* the temporary CSR arrays are no longer needed once converted */
          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
         #endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        /* row offsets + column indices (int) + cprow indices (PetscInt) + 3 constants and values (PetscScalar) */
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    /* both == PETSC_FALSE only when the host had no value array (pattern-only upload) */
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
2024 
/* Thrust functor for zipped iterators: accumulate the first tuple
   component into the second, i.e. y[i] += x[i]. */
struct VecCUDAPlusEquals
{
  template <typename T>
  __host__ __device__
  void operator()(T t)
  {
    thrust::get<1>(t) = thrust::get<0>(t) + thrust::get<1>(t);
  }
};
2034 
/* Thrust functor for zipped iterators: copy the first tuple component
   into the second, i.e. y[i] = x[i]. */
struct VecCUDAEquals
{
  template <typename T>
  __host__ __device__
  void operator()(T t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
2044 
/* Thrust functor for zipped iterators: copy in the opposite direction of
   VecCUDAEquals, i.e. x[i] = y[i] (second component into the first). */
struct VecCUDAEqualsReverse
{
  template <typename T>
  __host__ __device__
  void operator()(T t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};
2054 
/* Per-product data attached to Mat_Product for cuSPARSE-based matrix-matrix
   products (sparse-dense and sparse-sparse); freed by MatDestroy_MatMatCusparse. */
struct MatMatCusparse {
  PetscBool             cisdense;  /* if true, convert the result back to MATSEQDENSE (CPU) after the numeric phase */
  PetscScalar           *Bt;       /* device buffer holding an explicit transpose of B (pre-CUDA-11 csrmm cannot transpose B) */
  Mat                   X;         /* intermediate dense product used by PtAP/RARt */
  PetscBool             reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops;
  CsrMatrix             *Bcsr;

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr; /* sparse descriptor for B */
  PetscBool             initialized;   /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr;   /* dense descriptor for B */
  cusparseDnMatDescr_t  matCDescr;   /* dense descriptor for C (or the intermediate X) */
  PetscInt              Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  void                  *dBuffer4;
  void                  *dBuffer5;
 #endif
  size_t                mmBufferSize; /* size of mmBuffer currently allocated */
  void                  *mmBuffer;    /* workspace for cusparseSpMM / SpGEMM compute */
  void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};
2079 
/* Destroy callback for the MatMatCusparse product data: releases device buffers,
   cuSPARSE descriptors, and the intermediate dense matrix X.  Unset pointers are
   NULL, so each release is guarded (cudaFree itself tolerates NULL for Bt). */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  PetscErrorCode   ierr;
  MatMatCusparse   *mmdata = (MatMatCusparse *)data;
  cudaError_t      cerr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t stat;
 #endif

  PetscFunctionBegin;
  cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr);
  delete mmdata->Bcsr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->matBDescr)   { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->matCDescr)   { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->spgemmDesc)  { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); }
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  if (mmdata->dBuffer4)  { cerr = cudaFree(mmdata->dBuffer4);CHKERRCUDA(cerr); }
  if (mmdata->dBuffer5)  { cerr = cudaFree(mmdata->dBuffer5);CHKERRCUDA(cerr); }
 #endif
  if (mmdata->mmBuffer)  { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); }
  if (mmdata->mmBuffer2) { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); }
 #endif
  ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2108 
2109 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
2110 
/* Numeric phase of a sparse(A, cuSPARSE) times dense(B, CUDA) product.
   Computes op(A)*op(B) via cusparseSpMM (CUDA>=11) or csrmm (older CUDA);
   for PtAP/RARt the result first lands in the intermediate dense matrix
   mmdata->X and is then finished with a dense-dense product. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr   = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp   = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* select which mult struct (A or A^T) and operation to use, and the result sizes m x n */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    /* either transpose on the fly or use an explicitly formed A^T */
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);

  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* for RARt/PtAP the sparse-dense product is written into the intermediate X */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    /* recreate the dense descriptors when a leading dimension changed */
    if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Clda = clda;
    }

    /* lazily create the sparse descriptor for A (may be absent for empty matrices) */
    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
    /* grow the workspace only when the required size exceeds what we have */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      cudaError_t cerr;
      cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
      cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
 #else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* form B^T explicitly in mmdata->Bt with a cuBLAS out-of-place transpose */
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k    = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
 #endif
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  /* finish RARt/PtAP with a dense-dense product of B and the intermediate X */
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  /* convert back to the CPU types the caller originally provided */
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2283 
/* Symbolic phase of a sparse(A, cuSPARSE) times dense(B) product: determines
   the sizes of C from the product type, sets C to MATSEQDENSECUDA, and creates
   the MatMatCusparse data (transpose buffer for old CUDA, intermediate X for
   PtAP/RARt) consumed by the numeric phase. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  /* result sizes m x n per product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense;
 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
 #endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2357 
/*
  Numeric phase of the sparse-sparse product C = op(A)*op(B) (AB, AtB or ABt)
  computed on the GPU with cuSPARSE.

  The symbolic phase (MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE) must
  have run first: it stores a MatMatCusparse context in C->product->data that
  is consumed here.  If that phase already computed the values (reusesym set),
  this routine only performs the assembly bookkeeping at "finalize".
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t         BmatSpDescr;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;
  A = product->A;
  B = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    /* values are already on the GPU: only validate C's structures and jump to assembly */
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix*)Cmat->mat;
    if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
    goto finalize;
  }
  if (!c->nz) goto finalize; /* C has no nonzeros: nothing to compute, just finish assembly */
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  /* make sure the latest host values of A and B are on the device */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);

  /* Mirror the symmetry-based remapping done in the symbolic phase (AtB/ABt
     collapse to AB when the transposed operand is symmetric), so the same
     mult structs are selected here */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    if (!product->symbolic_used_the_fact_A_is_symmetric) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    if (!product->symbolic_used_the_fact_B_is_symmetric) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric");
  }
  /* transposed operands use the explicitly stored transpose (matTranspose),
     since opA/opB are always CUSPARSE_OPERATION_NON_TRANSPOSE (see above) */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix*)Cmat->mat;
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
    /* CUDA >= 11.4: recompute only the values of C, reusing the sparsity
       analysis cached in mmdata->spgemmDesc by the symbolic phase */
    stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                               Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                               cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                               mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
  #else
    /* CUDA 11.0-11.3: recompute the product into the work buffer, then copy
       the values into C's CSR storage */
    stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                               Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                               cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                               mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
    stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                               Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                               cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
  #endif
#else
  /* CUDA < 11: legacy csrgemm interface writes directly into C's CSR arrays */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  C->offloadmask = PETSC_OFFLOAD_GPU; /* up-to-date values live on the device only */
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
  ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
  c->reallocs         = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}
2487 
/*
  Symbolic phase of the sparse-sparse product C = op(A)*op(B) (AB, AtB or ABt)
  on the GPU with cuSPARSE.

  Sets the sizes and type of C, builds its device CSR structures (row offsets,
  column indices, values) plus the host Mat_SeqAIJ mirror of the pattern, and
  stores a MatMatCusparse context in C->product->data for the numeric phase.
  Handles compressed-row storage for both A (which dictates C's row pattern)
  and B (whose full row offsets must be reconstructed for cuSPARSE).

  Note: with the cuSPARSE generic API the values of C are computed here as
  well (cusparseSpGEMM[reuse]_compute is called below); when both operands are
  already up to date on the GPU, mmdata->reusesym is set so the numeric phase
  can skip the recomputation.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *a,*b,*c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     i,j,m,n,k;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble               flops;
  PetscBool                    biscompressed,ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
  cusparseSpMatDescr_t         BmatSpDescr;
#else
  int                          cnz;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ*)A->data;
  b = (Mat_SeqAIJ*)B->data;
  /* product data: freed by MatDestroy_MatMatCusparse when the product is destroyed */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");

  /* Exploit symmetry: AtB/ABt collapse to AB when the transposed operand is
     symmetric; record the fact so the numeric phase applies the same remapping */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
  }
  if (B->symmetric && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
  }
  /* Pick result sizes (m x n, inner dim k) and the operand mult structs;
     transposes are formed explicitly because opA/opB are always non-transpose */
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  switch (ptype) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    k = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr);
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  ierr  = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  ierr  = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  c     = (Mat_SeqAIJ*)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
  Ccsr  = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, then c will be in compressed row format */
    c->compressedrow.nrows = a->compressedrow.nrows;
    ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
    ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m; /* device CSR may have fewer rows than C */
  Ccusp->mat      = Cmat;
  Ccusp->mat->mat = Ccsr;
  Ccsr->num_rows    = Ccusp->nrows;
  Ccsr->num_cols    = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
  stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
  /* device-resident scalar constants used with CUSPARSE_POINTER_MODE_DEVICE */
  cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
    /* empty product: set an all-zero row-offset array and skip the spgemm calls */
    thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
    c->nz = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix*)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix */
    /* build a shallow CsrMatrix view of B that shares column indices and values
       with the compressed storage but uses full (uncompressed) row offsets */
    CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
    Bcsr = new CsrMatrix;
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices;
    Bcsr->values         = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) { /* lazily upload the full host row offsets b->i */
      Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
      ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr = Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
                               Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                               Bcsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  /* precompute flops count from the host sparsity patterns; 2 flops per
     (a_ij, b_jk) pair for AB, and per overlapping row pair for AtB */
  if (ptype == MATPRODUCT_AB) {
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i+1];
      for (j=st; j<en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2.*(b->i[brow+1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt anzi = a->i[i+1] - a->i[i];
      const PetscInt bnzi = b->i[i+1] - b->i[i];
      flops += (2.*anzi)*bnzi;
    }
  } else { /* TODO */
    flops = 0.;
  }

  mmdata->flops = flops;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  /* create C's descriptor with 0 nnz and NULL arrays; the real pointers are
     attached below with cusparseCsrSetPointers once the nnz count is known */
  stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
                          NULL, NULL, NULL,
                          CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                          CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
 {
  /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
     We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
  */
  void*  dBuffer1 = NULL;
  void*  dBuffer2 = NULL;
  void*  dBuffer3 = NULL;
  /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
  size_t bufferSize1 = 0;
  size_t bufferSize2 = 0;
  size_t bufferSize3 = 0;
  size_t bufferSize4 = 0;
  size_t bufferSize5 = 0;

  /*----------------------------------------------------------------------*/
  /* ask bufferSize1 bytes for external memory */
  stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                            CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                            &bufferSize1, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &dBuffer1, bufferSize1);CHKERRCUDA(cerr);
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                            CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                            &bufferSize1, dBuffer1);CHKERRCUSPARSE(stat);

  /*----------------------------------------------------------------------*/
  /* two-call idiom again: first query sizes with NULL buffers, then run for real */
  stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                 &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &dBuffer2, bufferSize2);CHKERRCUDA(cerr);
  cerr = cudaMalloc((void**) &dBuffer3, bufferSize3);CHKERRCUDA(cerr);
  cerr = cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4);CHKERRCUDA(cerr);
  stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                 &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);CHKERRCUSPARSE(stat);
  cerr = cudaFree(dBuffer1);CHKERRCUDA(cerr);
  cerr = cudaFree(dBuffer2);CHKERRCUDA(cerr);

  /*----------------------------------------------------------------------*/
  /* get matrix C non-zero entries C_nnz1 */
  stat  = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
  c->nz = (PetscInt) C_nnz1;
  /* allocate matrix C */
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values         = new THRUSTARRAY(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  /* update matC with the new pointers */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());CHKERRCUSPARSE(stat);

  /*----------------------------------------------------------------------*/
  stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                  &bufferSize5, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5);CHKERRCUDA(cerr);
  stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                  &bufferSize5, mmdata->dBuffer5);CHKERRCUSPARSE(stat);
  cerr = cudaFree(dBuffer3);CHKERRCUDA(cerr);
  /* compute the values of C now; the numeric phase will call reuse_compute again */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                                     Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                     cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                     mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
  ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024);CHKERRQ(ierr);
 }
 #else // ~PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  size_t bufSize2;
  /* ask bufferSize bytes for external memory */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
  /* ask bufferSize again bytes for external memory */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
  /* The CUSPARSE documentation is not clear, nor the API
     We need both buffers to perform the operations properly!
     mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
     it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
     is stored in the descriptor! What a messy API... */
  cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
  /* compute the intermediate product of A * B */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  /* get matrix C non-zero entries C_nnz1 */
  stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
  c->nz = (PetscInt) C_nnz1;
  ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr);
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
 #endif
#else
  /* CUDA < 11: two-step legacy interface, first the nnz count then the product */
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
  stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
  c->nz = cnz;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
     I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
     D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalizesym:
  /* build the host Mat_SeqAIJ pattern (c->i, c->j, ilen/imax, etc.) mirroring
     the device CSR just computed */
  c->singlemalloc = PETSC_FALSE;
  c->free_a       = PETSC_TRUE;
  c->free_ij      = PETSC_TRUE;
  ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
  if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
    PetscInt *d_i = c->i;
    THRUSTINTARRAY ii(Ccsr->row_offsets->size());
    THRUSTINTARRAY jj(Ccsr->column_indices->size());
    ii   = *Ccsr->row_offsets;
    jj   = *Ccsr->column_indices;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  } else {
    PetscInt *d_i = c->i;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  if (ciscompressed) { /* need to expand host row offsets */
    /* rows of C not present in the compressed pattern are empty: replicate the
       previous offset for them */
    PetscInt r = 0;
    c->i[0] = 0;
    for (k = 0; k < c->compressedrow.nrows; k++) {
      const PetscInt next = c->compressedrow.rindex[k];
      const PetscInt old = c->compressedrow.i[k];
      for (; r < next; r++) c->i[r+1] = old;
    }
    for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
  }
  ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
  c->maxnz = c->nz;
  c->nonzerorowcnt = 0;
  c->rmax = 0;
  /* per-row lengths, nonzero-row count and max row length from the row offsets */
  for (k = 0; k < m; k++) {
    const PetscInt nn = c->i[k+1] - c->i[k];
    c->ilen[k] = c->imax[k] = nn;
    c->nonzerorowcnt += (PetscInt)!!nn;
    c->rmax = PetscMax(c->rmax,nn);
  }
  ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
  Ccsr->num_entries = c->nz;

  C->nonzerostate++;
  ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
  Ccusp->nonzerostate = C->nonzerostate;
  C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
  C->preallocated  = PETSC_TRUE;
  C->assembled     = PETSC_FALSE;
  C->was_assembled = PETSC_FALSE;
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
2886 
2887 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2888 
/* handles sparse or dense B.
   Selects the symbolic routine for mat = product(A,B[,C]). The GPU (cuSPARSE) backend is
   chosen only when all operands are of the CUSPARSE type and none is bound to the CPU;
   each product type also exposes a runtime option to force the CPU code path. */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
  PetscErrorCode ierr;
  PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr);
  /* B only qualifies for the GPU path when neither A nor B has been bound to the CPU */
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr);
  }
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr);
    }
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool usecpu = PETSC_FALSE;
    /* The option name depends on whether the user went through the old API (MatMatMult etc.)
       or the MatProduct API; both spellings control the same fallback */
    switch (product->type) {
    case MATPRODUCT_AB:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_ab_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_AtB:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_atb_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_PtAP:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_ptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_RARt:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_rart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_ABC:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_abc_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    default:
      break;
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
     if (product->A->boundtocpu) {
        ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      /* no specialized GPU triple-product kernel: compose from pairwise products */
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3011 
/* yy = A*xx: plain multiply, delegated to the shared kernel (no add, no transpose) */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode perr;

  PetscFunctionBegin;
  perr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(perr);
  PetscFunctionReturn(0);
}
3020 
/* zz = A*xx + yy: multiply-add, delegated to the shared kernel (no transpose) */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode perr;

  PetscFunctionBegin;
  perr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(perr);
  PetscFunctionReturn(0);
}
3029 
/* yy = A^H * xx: conjugate-transpose multiply, delegated to the shared kernel */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode perr;

  PetscFunctionBegin;
  perr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(perr);
  PetscFunctionReturn(0);
}
3038 
/* zz = A^H * xx + yy: conjugate-transpose multiply-add, delegated to the shared kernel */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode perr;

  PetscFunctionBegin;
  perr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(perr);
  PetscFunctionReturn(0);
}
3047 
/* yy = A^T * xx: transpose multiply, delegated to the shared kernel */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode perr;

  PetscFunctionBegin;
  perr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(perr);
  PetscFunctionReturn(0);
}
3056 
/* ScatterAdd: y[idx[i]] += x[i] for i in [0,n).
   Used to scatter-add the compressed-row work vector into the full output vector.
   Assumes the idx[] entries are distinct (compressed row indices are), so no atomics
   are needed. Uses a grid-stride loop and PetscInt index arithmetic so the kernel is
   correct for any launch configuration and for n > 2^31-1 (64-bit-index PETSc builds),
   where the previous `int i = blockIdx.x*blockDim.x + threadIdx.x` could overflow. */
__global__ static void ScatterAdd(PetscInt n, const PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  const PetscInt stride = (PetscInt)gridDim.x*blockDim.x;
  for (PetscInt i = (PetscInt)blockIdx.x*blockDim.x + threadIdx.x; i < n; i += stride) y[idx[i]] += x[i];
}
3062 
/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op.
   Shared worker behind all MatMult*/
/* variants: yy may be NULL (no add) and may alias zz (in-place add).
   Compressed-row storage (matrices with zero rows dropped) is handled by multiplying into/from
   a short work vector and scattering to/from the full-length vectors. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny;
#endif

  PetscFunctionBegin;
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
  if (!a->nonzerorowcnt) {
    /* matrix has no nonzero rows: the product is zero, so just set or copy the output */
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !A->form_explicit_transpose) {
      /* let cuSPARSE apply the (conjugate) transpose operation to the stored matrix */
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      /* use an explicitly stored transpose (built on demand) with a non-transpose multiply */
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
     #endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector: workVector[k] = x[cprowIndices[k]] */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* transposed multiply: x has num_rows entries, y has num_cols entries */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
     #endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* opA indexes the per-operation descriptor cache cuSpMV[], so guard its range */
      if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        cudaError_t cerr;
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                matstruct->matDescr,
                                matstruct->cuSpMV[opA].vecXDescr, beta,
                                matstruct->cuSpMV[opA].vecYDescr,
                                cusparse_scalartype,
                                cusparsestruct->spmvAlg,
                                &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
      }

      stat = cusparseSpMV(cusparsestruct->handle, opA,
                               matstruct->alpha_one,
                               matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
                               matstruct->cuSpMV[opA].vecXDescr,
                               beta,
                               matstruct->cuSpMV[opA].vecYDescr,
                               cusparse_scalartype,
                               cusparsestruct->spmvAlg,
                               matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
     #else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha_one, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
     #endif
    } else {
      if (cusparsestruct->nrows) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha_one, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
       #endif
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
         */
       #if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
       #else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
       #endif
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  /* multiply-add is 2 flops per stored nonzero; without the add, one flop fewer per nonempty row */
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3254 
/* zz = A^T * xx + yy: transpose multiply-add, delegated to the shared kernel */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode perr;

  PetscFunctionBegin;
  perr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(perr);
  PetscFunctionReturn(0);
}
3263 
/* Finish assembly on the host, then drop the cached device matrix if the nonzero
   structure changed during assembly (it would no longer match the host structure). */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode     ierr;
  PetscObjectState   statebefore = A->nonzerostate; /* snapshot before host-side assembly */
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (cusp->deviceMat && statebefore != A->nonzerostate) {
    cudaError_t cerr;

    ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr);
    cerr = cudaFree(cusp->deviceMat);CHKERRCUDA(cerr);
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}
3281 
3282 /* --------------------------------------------------------------------------------*/
3283 /*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
3287    assembly performance the user should preallocate the matrix storage by setting
3288    the parameter nz (or the array nnz).  By setting these parameters accurately,
3289    performance during matrix assembly can be increased by more than a factor of 50.
3290 
3291    Collective
3292 
3293    Input Parameters:
3294 +  comm - MPI communicator, set to PETSC_COMM_SELF
3295 .  m - number of rows
3296 .  n - number of columns
3297 .  nz - number of nonzeros per row (same for all rows)
3298 -  nnz - array containing the number of nonzeros in the various rows
3299          (possibly different for each row) or NULL
3300 
3301    Output Parameter:
3302 .  A - the matrix
3303 
   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
3307 
3308    Notes:
3309    If nnz is given then nz is ignored
3310 
3311    The AIJ format (also called the Yale sparse matrix format or
3312    compressed row storage), is fully compatible with standard Fortran 77
3313    storage.  That is, the stored row and column indices can begin at
3314    either one (as in Fortran) or zero.  See the users' manual for details.
3315 
3316    Specify the preallocated storage with either nz or nnz (not both).
3317    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
3318    allocation.  For large problems you MUST preallocate memory or you
3319    will get TERRIBLE performance, see the users' manual chapter on matrices.
3320 
3321    By default, this format uses inodes (identical nodes) when possible, to
3322    improve numerical efficiency of matrix-vector products and solves. We
3323    search for consecutive rows with the same nonzero structure, thereby
3324    reusing matrix information to achieve increased efficiency.
3325 
3326    Level: intermediate
3327 
3328 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
3329 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode perr;

  PetscFunctionBegin;
  /* create an empty Mat, fix its (sequential) dimensions and CUSPARSE type, then preallocate */
  perr = MatCreate(comm,A);CHKERRQ(perr);
  perr = MatSetSizes(*A,m,n,m,n);CHKERRQ(perr);
  perr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(perr);
  perr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(perr);
  PetscFunctionReturn(0);
}
3341 
/* Destroy the GPU-side data (plain or factored), detach every composed method,
   then hand off to the base SeqAIJ destructor for the host-side data. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;
  size_t         k;
  /* composed function names to clear, in the same order they were registered */
  const char     *const composed[] = {"MatSeqAIJCopySubArray_C",
                                      "MatCUSPARSESetFormat_C",
                                      "MatCUSPARSESetUseCPUSolve_C",
                                      "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",
                                      "MatProductSetFromOptions_seqaijcusparse_seqdense_C",
                                      "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",
                                      "MatFactorGetSolverType_C",
                                      "MatSetPreallocationCOO_C",
                                      "MatSetValuesCOO_C",
                                      "MatConvert_seqaijcusparse_hypre_C"};

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  for (k = 0; k < sizeof(composed)/sizeof(composed[0]); k++) {
    ierr = PetscObjectComposeFunction((PetscObject)A,composed[k],NULL);CHKERRQ(ierr);
  }
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3365 
3366 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
3367 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicate on the host via the base SeqAIJ routine, then convert the copy
   in place to the CUSPARSE type so it regains the GPU implementation. */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode perr;

  PetscFunctionBegin;
  perr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(perr);
  perr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(perr);
  PetscFunctionReturn(0);
}
3377 
/* Y = Y + a*X computed on the GPU.
   Falls back to the host kernel when X does not share this implementation. Otherwise:
   matching nonzero patterns use a single cuBLAS axpy on the value arrays; SUBSET pattern
   uses cuSPARSE csrgeam; anything else falls back to the host kernel. */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry,*csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    /* X is not using this GPU implementation: compute on the host and drop Y's cached transpose */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
  if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;
  /* see if we can turn this into a cublas axpy: upgrade str to SAME_NONZERO_PATTERN when the
     two CSR structures compare equal on the device */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
    if (eq) {
      eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
    }
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    /* Y = a*X + 1.0*Y via cuSPARSE csrgeam, writing the result in place into Y's arrays */
    cusparseStatus_t stat;
    PetscScalar      b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t           bufferSize;
    void             *buffer;
    cudaError_t      cerr;
#endif

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    /* scalars a and b live on the host here, so switch the pointer mode temporarily */
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* CUDA 11+ requires an explicitly sized work buffer for csrgeam2 */
    stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
                                          &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                          &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                             cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    cerr = cudaFree(buffer);CHKERRCUDA(cerr);
#else
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
#endif
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical structure: the value arrays line up entry by entry, so a dense axpy suffices */
    cublasHandle_t cublasv2handle;
    cublasStatus_t berr;
    PetscBLASInt   one = 1, bnz = 1;

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr);
    ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else {
    /* different patterns: Y's structure will change, so do the work on the host */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3475 
3476 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
3477 {
3478   PetscErrorCode ierr;
3479   Mat_SeqAIJ     *y = (Mat_SeqAIJ*)Y->data;
3480   PetscScalar    *ay;
3481   cublasHandle_t cublasv2handle;
3482   cublasStatus_t berr;
3483   PetscBLASInt   one = 1, bnz = 1;
3484 
3485   PetscFunctionBegin;
3486   ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
3487   ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
3488   ierr = PetscBLASIntCast(y->nz,&bnz);CHKERRQ(ierr);
3489   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3490   berr = cublasXscal(cublasv2handle,bnz,&a,ay,one);CHKERRCUBLAS(berr);
3491   ierr = PetscLogGpuFlops(bnz);CHKERRQ(ierr);
3492   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
3493   ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
3494   ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
3495   PetscFunctionReturn(0);
3496 }
3497 
/* Set all stored entries of A to zero without changing the sparsity pattern.
   For unfactored matrices the device-side CSR values of the matrix and (if built)
   of its transpose are zeroed with thrust::fill; the host array is always zeroed. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;
  PetscBool      both = PETSC_FALSE; /* set when the device copy was zeroed too */
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
    if (spptr->mat) {
      CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
      if (matrix->values) {
        both = PETSC_TRUE;
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
    if (spptr->matTranspose) { /* keep the cached transpose consistent as well */
      CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
      if (matrix->values) {
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
  }
  //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);
  ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr); /* zero the host values; a->i[nrows] is nnz */
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  /* when both copies were zeroed they remain in sync, otherwise the host copy is authoritative */
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
3528 
/* Switch the operation table of A between host (SeqAIJ) and device (CUSPARSE) kernels.
   flg = PETSC_TRUE binds the matrix to the CPU: the host copy is refreshed from the GPU
   and all operations fall back to the plain SeqAIJ implementations; flg = PETSC_FALSE
   (re)installs the CUSPARSE implementations.  Factored matrices are left untouched. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  if (flg) {
    /* make sure the host copy is current before the CPU kernels take over */
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* wipe the SeqAIJ-level array-access hooks installed in the else branch below */
    ierr = PetscMemzero(a->ops,sizeof(Mat_SeqAIJOps));CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  /* enable inode use only when bound to the CPU and inode information exists */
  if (flg && a->inode.size) {
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3589 
/* Convert a MATSEQAIJ matrix to MATSEQAIJCUSPARSE.  Depending on reuse the target is
   duplicated, copied into, or converted in place; the CUSPARSE bookkeeping struct is
   allocated (one variant for regular matrices, one for triangular factors) and the
   GPU function table is installed via MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE). */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  B = *newmat; /* for MAT_INPLACE_MATRIX this is A itself */

  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr); /* vectors created off B default to CUDA */

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;
      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      spptr->format     = MAT_CUSPARSE_CSR;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
     #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
      spptr->spmvAlg    = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
     #else
      spptr->spmvAlg    = CUSPARSE_CSRMV_ALG1;    /* default, since we only support csr */
     #endif
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
     #endif
      B->spptr = spptr;
    } else {
      /* factored matrices carry triangular-factor data instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; /* nothing on the device yet */
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr); /* install the GPU kernel table */
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
#if defined(PETSC_HAVE_HYPRE)
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr);
#endif
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetUseCPUSolve_C",MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3651 
/* MATSEQAIJCUSPARSE constructor: create a plain SeqAIJ matrix, then convert it in place */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3661 
3662 /*MC
3663    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3664 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later, and are no longer supported since CUDA 11.0.
3667    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3668 
3669    Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_use_cpu_solve - Do MatSolve on CPU
3674 
3675   Level: beginner
3676 
3677 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3678 M*/
3679 
3680 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);
3681 
/* Register the CUSPARSE-based factorization packages with PETSc:
   MATSOLVERCUSPARSEBAND provides LU for MATSEQAIJ, and MATSOLVERCUSPARSE provides
   LU, Cholesky, ILU and ICC for MATSEQAIJCUSPARSE. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode      ierr;
  const MatFactorType ftypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};

  PetscFunctionBegin;
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND,MATSEQAIJ,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr);
  for (int f = 0; f < (int)(sizeof(ftypes)/sizeof(ftypes[0])); f++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftypes[f],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3695 
/* Free everything held by a Mat_SeqAIJCUSPARSE struct: the device matrix and its
   cached transpose, the work vector, COO assembly metadata, the csr2csc index map,
   and the cusparse handle, then the struct itself. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    delete (*cusparsestruct)->csr2csc_i;
    if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3715 
/* Free the three device arrays of a CsrMatrix and the container itself.
   The caller's pointer is reset to NULL, so a repeated call is a no-op. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (!*mat) PetscFunctionReturn(0);
  delete (*mat)->values;
  delete (*mat)->column_indices;
  delete (*mat)->row_offsets;
  delete *mat;
  *mat = NULL;
  PetscFunctionReturn(0);
}
3728 
/* Free one triangular-factor structure: its cusparse matrix descriptor and analysis
   info, the CSR storage, and the device/host scratch buffers, then the struct itself. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    if ((*trifactor)->solveBuffer)   {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
    if ((*trifactor)->AA_h)   {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);} /* host-pinned buffer */
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
   #endif
    ierr = PetscFree(*trifactor);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3748 
/* Free a Mat_SeqAIJCUSPARSEMultStruct: the stored matrix (CSR, or HYB/ELL on pre-11
   CUDA), its descriptor, the compressed-row index vector, the device-resident scalar
   constants, and (CUDA >= 11) the SpMV descriptors/buffers.  *matstruct is NULLed. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
       #endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr); /* bug fix: the returned error code was previously ignored */
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    /* alpha_one/beta_zero/beta_one are device-resident scalars used with
       CUSPARSE_POINTER_MODE_DEVICE */
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    for (int i=0; i<3; i++) { /* one cached SpMV setup per operation variant */
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
3792 
/* Release all factorization data held by a Mat_SeqAIJCUSPARSETriFactors struct
   without destroying the struct (or the cusparse handle it owns), so it can be
   refilled by a subsequent factorization.  Safe to call repeatedly: every freed
   member is reset to NULL. */
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = NULL;
    (*trifactors)->cpermIndices = NULL;
    (*trifactors)->workVector = NULL;
    /* bug fix: NULL the band-factor device buffers after freeing them; previously a
       second Reset() would have called cudaFree() on the stale pointers */
    if ((*trifactors)->a_band_d)   {cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);(*trifactors)->a_band_d = NULL;}
    if ((*trifactors)->i_band_d)   {cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);(*trifactors)->i_band_d = NULL;}
    (*trifactors)->init_dev_prop = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3815 
/* Fully destroy a Mat_SeqAIJCUSPARSETriFactors container: reset its members,
   destroy the cusparse handle it owns, and free the struct itself. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    cusparseHandle_t handle = (*trifactors)->handle; /* Reset() does not touch the handle */

    ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    if (handle) { stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3832 
/* Strict weak ordering on (row, col) tuples: order by row first, then by column */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    const PetscInt r1 = t1.get<0>(), r2 = t2.get<0>();
    if (r1 != r2) return r1 < r2;
    return t1.get<1>() < t2.get<1>();
  }
};
3843 
/* Equality of (row, col) tuples; used to detect duplicate COO entries */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>();
  }
};
3853 
/* 0 when the two values are equal, 1 otherwise; used with adjacent_difference to
   flag positions where an index sequence changes */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != t2) ? 1 : 0;
  }
};
3862 
/* Logical OR of two flags, yielding 0 or 1 */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != 0 || t2 != 0) ? 1 : 0;
  }
};
3871 
3872 #include <thrust/iterator/discard_iterator.h>
/* Insert (INSERT_VALUES) or accumulate (ADD_VALUES) the values v[] — given in the
   original COO order passed to MatSetPreallocationCOO_SeqAIJCUSPARSE() — into the
   device CSR values, using the permutation cooPerm and, when duplicate COO entries
   exist, the reduction map cooPerm_a computed at preallocation time.
   v may be a host or a device pointer; v == NULL with INSERT_VALUES zeroes the values. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY                           *cooPerm_v = NULL; /* device copy of v when v is a host pointer */
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                             *matrix;
  PetscErrorCode                        ierr;
  PetscInt                              n;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    /* no COO preallocation information: fall back to a plain final assembly */
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) {
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else {
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v,v+n); /* host -> device transfer */
    d_v = cooPerm_v->data();
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
  }
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to sum them up first */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
        cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonzeros in d_v[].
        cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
      */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>()); /* values += reduced input */
      delete cooPerm_w;
    } else {
      /* all nonzeros in d_v[] are unique entries */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]]  */
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else {
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals()); /* values[i] = d_v[cooPerm[i]] */
    }
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU; /* the device copy is now the authoritative one */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr);
  ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr);
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
3954 
/* Mark the cached device transpose of A as out of date; when destroy is PETSC_TRUE
   the cached transpose structure and the csr2csc index map are freed as well. */
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp) {
    if (destroy) {
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
      delete cusp->csr2csc_i;
      cusp->csr2csc_i = NULL;
    }
    A->transupdated = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3971 
3972 #include <thrust/binary_search.h>
/* Build the CSR sparsity pattern of A from n COO entries (coo_i[], coo_j[]), which may
   be unsorted and may contain duplicates.  Records on the device the permutation
   cooPerm (COO order -> sorted order) and, when duplicates exist, the map cooPerm_a
   (sorted entry -> unique nonzero) consumed later by MatSetValuesCOO_SeqAIJCUSPARSE().
   The host CSR arrays of A are rebuilt to match, and zeros are pushed to the GPU so
   the CUSPARSE matvec structure is allocated up front. */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0; /* nzr counts nonempty rows */
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    /* previous COO metadata (if any) had a different size: discard it */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n); /* host -> device copies of the index arrays */
    d_j.assign(coo_j,coo_j+n);

    /* Ex.
      n = 6
      coo_i = [3,3,1,4,1,4]
      coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    *cusp->cooPerm_a = d_i; /* copy the sorted array */
    THRUSTINTARRAY w = d_j;

    /*
      d_i     = [1,1,3,3,4,4]
      d_j     = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i     = [1,3,3,4,4,x]
                            ^ekey
      d_j     = [2,2,3,5,6,x]
                           ^nekey
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a; /* no reduction map needed */
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());                                              /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
      (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a =          [0,0,1,1,1,1]*/
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin()); /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* rebuild the host CSR arrays of A so they match the new pattern */
    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i]; /* nonzeros in row i */
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr); /* empty matrix */
  }
  ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);

  A->assembled = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
4090 
4091 /*@C
4092     MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices.
4093 
4094    Not collective
4095 
4096     Input Parameters:
4097 +   A - the matrix
-   compressed - PETSC_TRUE or PETSC_FALSE indicating whether the matrix data structure should always be returned in compressed form
4099 
4100     Output Parameters:
4101 +   ia - the CSR row pointers
4102 -   ja - the CSR column indices
4103 
4104     Level: developer
4105 
4106     Notes:
4107       When compressed is true, the CSR structure does not contain empty rows
4108 
4109 .seealso: MatSeqAIJCUSPARSERestoreIJ(), MatSeqAIJCUSPARSEGetArrayRead()
4110 @*/
/* Return device pointers to the CSR row offsets (i) and column indices (j) of A.
   Either output pointer may be NULL when that array is not wanted.  When compressed
   is PETSC_FALSE and the matrix uses compressed-row storage, a full-length row-offset
   array is lazily built on the device (and cached) and returned instead. */
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  /* bug fix: was (!i || !j), which returned early when only one of the two outputs
     was requested and made the individual guards below unreachable */
  if (!i && !j) PetscFunctionReturn(0);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); /* may trigger host -> device copy */
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      if (!cusp->rowoffsets_gpu) {
        /* lazily build and cache the uncompressed row offsets on the device */
        cusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
        ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(0);
}
4139 
/*@C
    MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ()

   Not collective

    Input Parameters:
+   A - the matrix
-   compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form

    Output Parameters:
+   i - the CSR row pointers, set to NULL on return
-   j - the CSR column indices, set to NULL on return

    Level: developer

.seealso: MatSeqAIJCUSPARSEGetIJ()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* nothing needs to be written back; just drop the caller's aliases */
  if (j) *j = NULL;
  if (i) *i = NULL;
  PetscFunctionReturn(0);
}
4166 
/*@C
   MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes: may trigger host-device copies if up-to-date matrix data is on host

.seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArrayRead()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *mat;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* the raw value array is only exposed for the CSR storage format */
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* refresh the device copy when the host holds newer data */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  mat = (CsrMatrix*)cusp->mat->mat;
  if (!mat->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  /* read-only access: neither the offload mask nor the object state changes */
  *a = mat->values->data().get();
  PetscFunctionReturn(0);
}
4202 
/*@C
   MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead()

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data, set to NULL on return

   Level: developer

.seealso: MatSeqAIJCUSPARSEGetArrayRead()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* read access cannot have dirtied the matrix, so no state bump is needed */
  *a = NULL;
  PetscFunctionReturn(0);
}
4227 
/*@C
   MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes: may trigger host-device copies if up-to-date matrix data is on host

.seealso: MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArray()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *mat;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* the raw value array is only exposed for the CSR storage format */
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* bring the device copy up to date before handing out a writable pointer */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  mat = (CsrMatrix*)cusp->mat->mat;
  if (!mat->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = mat->values->data().get();
  /* the caller may modify values: the device copy becomes the authoritative one */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  /* transpose values (if formed) are now stale; PETSC_FALSE keeps its structure */
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/*@C
   MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray()

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data, set to NULL on return

   Level: developer

.seealso: MatSeqAIJCUSPARSEGetArray()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* values may have changed on the device: advance the object state */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL;
  PetscFunctionReturn(0);
}
4292 
/*@C
   MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes: does not trigger host-device copies and flags data validity on the GPU

.seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSERestoreArrayWrite()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *mat;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* the raw value array is only exposed for the CSR storage format */
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* write-only access: the current device values will be overwritten, so no
     host-to-device refresh is performed here */
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  mat = (CsrMatrix*)cusp->mat->mat;
  if (!mat->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = mat->values->data().get();
  /* after the write the device copy is the authoritative one */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  /* transpose values (if formed) become stale; PETSC_FALSE keeps its structure */
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4329 
/*@C
   MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite()

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data, set to NULL on return

   Level: developer

.seealso: MatSeqAIJCUSPARSEGetArrayWrite()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* the device values were (potentially) rewritten: advance the object state */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL;
  PetscFunctionReturn(0);
}
4357 
/* Lexicographic (row,col) ordering on (row, col, value, perm) tuples; used to
   merge the COO representations of two matrices while keeping rows sorted */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    const int r1 = thrust::get<0>(t1);
    const int r2 = thrust::get<0>(t2);
    if (r1 != r2) return r1 < r2;              /* primary key: row index */
    return thrust::get<1>(t1) < thrust::get<1>(t2); /* tie-break: column index */
  }
};
4368 
/* Unary functor adding a fixed offset to an integer; used to shift the column
   indices (or row offsets) of the second matrix when concatenating */
struct Shift
{
  int _shift; /* the constant offset applied to every input */

  Shift(int shift) : _shift(shift) {}
  __host__ __device__
  inline int operator() (const int &c)
  {
    return _shift + c;
  }
};
4380 
/* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in matlab notation */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     Annz,Bnnz;
  cusparseStatus_t             stat;
  PetscInt                     i,m,n,zero = 0;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidHeaderSpecific(B,MAT_CLASSID,2);
  PetscValidPointer(C,4);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
  if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n);
  if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
  if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    /* ---- MAT_INITIAL_MATRIX: build C = [A B] and its device CSR from scratch ---- */
    m     = A->rmap->n;
    n     = A->cmap->n + B->cmap->n;
    ierr  = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
    ierr  = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
    ierr  = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    c     = (Mat_SeqAIJ*)(*C)->data;
    Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
    Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr  = new CsrMatrix;
    /* C stores full rows (no compressed-row optimization) */
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows    = m;
    Ccusp->mat      = Cmat;
    Ccusp->mat->mat = Ccsr;
    Ccsr->num_rows  = m;
    Ccsr->num_cols  = n;
    stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
    /* device-resident scalar constants used by cusparse SpMV/SpMM calls */
    cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
    if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
    if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr = (CsrMatrix*)Acusp->mat->mat;
    Bcsr = (CsrMatrix*)Bcusp->mat->mat;
    Annz = (PetscInt)Acsr->column_indices->size();
    Bnnz = (PetscInt)Bcsr->column_indices->size();
    c->nz = Annz + Bnnz;
    Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    Ccsr->num_entries = c->nz;
    /* cooPerm records where each A/B entry landed in C so MAT_REUSE_MATRIX can
       scatter new values without recomputing the merge */
    Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      /* ---- convert A and B to COO, merge them row-wise, convert back to CSR ---- */
      auto Acoo = new THRUSTINTARRAY32(Annz);
      auto Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff,*Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
          ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
          ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      /* expand CSR row offsets of A and B into explicit COO row indices */
      stat = cusparseXcsr2coo(Acusp->handle,
                              Aroff->data().get(),
                              Annz,
                              m,
                              Acoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle,
                              Broff->data().get(),
                              Bnnz,
                              m,
                              Bcoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      /* constant 1/0 tags mark whether a merged entry came from A or from B */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
      /* shift B's column indices by A->cmap->n since B occupies the right block of C */
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
#endif
      auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
      auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
      auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
      auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
      auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
      auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
      auto p1 = Ccusp->cooPerm->begin();
      auto p2 = Ccusp->cooPerm->begin();
      thrust::advance(p2,Annz);
      /* both inputs are sorted by (row,col), so a single merge yields sorted C */
      PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
      /* undo the in-place shift applied to B's column indices above */
      thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
#endif
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
      /* split the merged positions back into A-origin (tag 1) and B-origin (tag 0)
         destinations: cooPerm[0..Annz) maps A's entries, cooPerm[Annz..) maps B's */
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      auto pred = thrust::identity<int>();
      PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
      PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
#endif
      /* compress the merged COO row indices into C's CSR row offsets */
      stat = cusparseXcoo2csr(Ccusp->handle,
                              Ccoo->data().get(),
                              c->nz,
                              m,
                              Ccsr->row_offsets->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
                               Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        /* ---- C' = [A' ; B'] stacked: concatenate transposed CSR arrays directly ---- */
        ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
        ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr);
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix *CcsrT = new CsrMatrix;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated = PETSC_TRUE;
        Ccusp->rowoffsets_gpu = NULL;
        CmatT->cprowIndices = NULL;
        CmatT->mat = CcsrT;
        CcsrT->num_rows = n;
        CcsrT->num_cols = m;
        CcsrT->num_entries = c->nz;

        CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values = new THRUSTARRAY(c->nz);

        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        /* row offsets of C': A' offsets followed by B' offsets shifted by a->nz;
           the two share the boundary value, hence the advance(rT,-1) overlap */
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
          thrust::advance(rT,-1);
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
          thrust::copy(titb,tite,rT);
        }
        /* column indices and values concatenate without any shifting: both
           transposes index the same m rows */
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

        stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
                                 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
                                 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* ---- mirror C's CSR structure (i and j, not values) back to the host ---- */
    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii   = *Ccsr->row_offsets;
      jj   = *Ccsr->column_indices;
      cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    } else {
      cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
    c->maxnz = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax = 0;
    /* derive per-row lengths and summary statistics from the host row offsets */
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i+1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax,nn);
    }
    ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
    /* host values array is allocated but NOT filled: C's values live on the GPU
       (offloadmask is set to PETSC_OFFLOAD_GPU below) */
    ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
    (*C)->nonzerostate++;
    ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
    ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* ---- MAT_REUSE_MATRIX: same sparsity; scatter new values via cooPerm ---- */
    if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",(*C)->rmap->n,B->rmap->n);
    c = (Mat_SeqAIJ*)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
      if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
      if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
      if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
      if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix*)Acusp->mat->mat;
      Bcsr = (CsrMatrix*)Bcusp->mat->mat;
      Ccsr = (CsrMatrix*)Ccusp->mat->mat;
      if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
      if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
      if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
      if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
      if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid,Acsr->num_entries);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      /* scatter A's values into C at the positions recorded in cooPerm[0..Annz) */
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      thrust::for_each(zibait,zieait,VecCUDAEquals());
      /* scatter B's values into C at the positions recorded in cooPerm[Annz..) */
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
      thrust::for_each(zibbit,ziebit,VecCUDAEquals());
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr);
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
        /* transposed values are simply the concatenation [A' values ; B' values] */
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    }
  }
  ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
4684 
/* Copies entries of the device value array of A into v: v[k] = aa[idx[k]] for
   k = 0,...,n-1, or the first n entries when idx is NULL. v may point to host
   or device memory; when it is device memory the gather stays on the GPU and
   no host transfer occurs. */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  PetscErrorCode    ierr;
  bool              dmem;                    /* true when v is CUDA memory */
  const PetscScalar *av;
  cudaError_t       cerr;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
  if (n && idx) {
    /* indirect gather: upload the index list, then permute on the device */
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

    THRUSTARRAY *w = NULL;                   /* device staging buffer, only when v is host memory */
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (w) {
      cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    delete w;
  } else {
    /* no index list: contiguous copy of the leading n entries */
    cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  /* when v is host memory the data moved device->host above: log it in that
     direction (the original code logged PetscLogCpuToGpu, which is backwards) */
  if (!dmem) { ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr); }
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4724