xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 5efe38cc0ffffa5b2a9c1209dc975e6dbd30a790)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
7 
8 #include <petscconf.h>
9 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
10 #include <../src/mat/impls/sbaij/seq/sbaij.h>
11 #include <../src/vec/vec/impls/dvecimpl.h>
12 #include <petsc/private/vecimpl.h>
13 #undef VecType
14 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
15 #include <thrust/async/for_each.h>
16 
/* Human-readable names for MatCUSPARSEStorageFormat; the trailing three entries
   (enum type name, option prefix, NULL sentinel) are the triple required by PetscOptionsEnum(). */
const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
    0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
  } cusparseCsr2CscAlg_t;
  */
  /* Entries are in 0-based enum-value order so the index PetscOptionsEnum() reports maps directly to the cuSPARSE enum value.
     Note COO_ALG4/CSR_ALG1 are swapped relative to alphabetical order because their cuSPARSE values are 5 and 4 respectively. */
  const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
  const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
  const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif
53 
/* ---- Forward declarations: factorization entry points ---- */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

/* ---- Forward declarations: solve, options and mat-vec operations ---- */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

/* ---- Forward declarations: GPU data-structure teardown.
   NOTE: the two MatSeqAIJCUSPARSEMultStruct_Destroy declarations are C++ overloads
   (this is a .cu file), one for triangular-factor structs and one for mult structs. ---- */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

/* ---- Forward declarations: host/device data movement and COO assembly ---- */
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);

PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
/* MatCUSPARSESetStream - record a user-supplied CUDA stream on the matrix and
   direct the attached cuSPARSE handle to issue its work on that stream.
   Errors with PETSC_ERR_COR if the matrix has no GPU-side data structure (spptr). */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   status;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusp->stream = stream; /* kept on the struct so later handle operations can reuse it */
  status = cusparseSetStream(cusp->handle,cusp->stream);CHKERRCUSPARSE(status);
  PetscFunctionReturn(0);
}
103 
/* MatCUSPARSESetHandle - install a user-supplied cuSPARSE handle on the matrix.
   Any previously-owned, different handle is destroyed first; the pointer mode of
   the (possibly new) handle is then forced to CUSPARSE_POINTER_MODE_DEVICE.
   Errors with PETSC_ERR_COR if the matrix has no GPU-side data structure (spptr). */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   status;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusp->handle != handle) {
    /* release the old handle (if any) before adopting the new one */
    if (cusp->handle) {status = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(status);}
    cusp->handle = handle;
  }
  status = cusparseSetPointerMode(cusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(status);
  PetscFunctionReturn(0);
}
120 
/* MatCUSPARSEClearHandle - forget (without destroying) the cuSPARSE handle stored
   on a MATSEQAIJCUSPARSE matrix. A no-op for other matrix types or when no
   GPU-side data structure exists.
   NOTE(review): the handle is only zeroed, not cusparseDestroy()ed -- presumably
   ownership lies with whoever installed it via MatCUSPARSESetHandle(); confirm. */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          isseqaijcusparse;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&isseqaijcusparse);CHKERRQ(ierr);
  if (!isseqaijcusparse || !cusp) PetscFunctionReturn(0);
  if (cusp->handle) cusp->handle = 0;
  PetscFunctionReturn(0);
}
133 
/* Query callback composed on factor matrices created by MatGetFactor_seqaijcusparse_cusparse():
   reports MATSOLVERCUSPARSE as the solver package that produced the factor. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
140 
141 /*MC
142   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
143   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
145   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
146   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
147   algorithms are not recommended. This class does NOT support direct solver operations.
148 
149   Level: beginner
150 
151 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
152 M*/
153 
/* MatGetFactor implementation for the MATSOLVERCUSPARSE package: creates an (empty)
   square MATSEQAIJCUSPARSE factor matrix B for A, wires up the symbolic-factorization
   function pointers for the requested factor type, and records the preferred
   orderings. GPU symbolic routines are installed unless A is bound to the CPU. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;     /* factor matrices are square: n x n */

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  /* factortype must be set before MatSetType() so type-specific setup sees it */
  (*B)->factortype = ftype;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (A->boundtocpu && A->bindingpropagates) { ierr = MatBindToCPU(*B,PETSC_TRUE);CHKERRQ(ierr); }
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    /* use GPU-aware symbolic routines only when A is allowed on the GPU */
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    /* nested dissection for full LU; natural ordering for the incomplete variants */
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);CHKERRQ(ierr);
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]);CHKERRQ(ierr);
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* allocation is deferred to the symbolic factorization */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  (*B)->canuseordering = PETSC_TRUE;
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
195 
/* Type-specific implementation behind MatCUSPARSESetFormat(): stores the requested
   GPU storage format. For sequential matrices MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL
   are equivalent -- there is a single format field -- so the cases fall through. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
  case MAT_CUSPARSE_ALL:
    cusp->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
213 
214 /*@
215    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
216    operation. Only the MatMult operation can use different GPU storage formats
217    for MPIAIJCUSPARSE matrices.
218    Not Collective
219 
220    Input Parameters:
221 +  A - Matrix of type SEQAIJCUSPARSE
222 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
223 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
224 
225    Output Parameter:
226 
227    Level: intermediate
228 
229 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
230 @*/
/* Public entry point: validates A and dispatches to the type-specific
   "MatCUSPARSESetFormat_C" method if one is composed on the object
   (PetscTryMethod is a no-op for matrix types without it). */
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
240 
/* Type-specific implementation behind MatCUSPARSESetUseCPUSolve(): records whether
   MatSolve should run on the CPU instead of the GPU (consulted by
   MatLUFactorNumeric_SeqAIJCUSPARSE when choosing solve function pointers).
   NOTE(review): unlike MatCUSPARSESetStream/SetHandle, spptr is dereferenced
   without a NULL check here -- confirm callers guarantee it is set. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A,PetscBool use_cpu)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->use_cpu_solve = use_cpu;
  PetscFunctionReturn(0);
}
249 
250 /*@
251    MatCUSPARSESetUseCPUSolve - Sets use CPU MatSolve.
252 
253    Input Parameters:
254 +  A - Matrix of type SEQAIJCUSPARSE
255 -  use_cpu - set flag for using the built-in CPU MatSolve
256 
257    Output Parameter:
258 
259    Notes:
260    The cuSparse LU solver currently computes the factors with the built-in CPU method
261    and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
262    This method to specify if the solve is done on the CPU or GPU (GPU is the default).
263 
264    Level: intermediate
265 
266 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
267 @*/
/* Public entry point: validates A and dispatches to the type-specific
   "MatCUSPARSESetUseCPUSolve_C" method if one is composed on the object
   (PetscTryMethod is a no-op for matrix types without it). */
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A,PetscBool use_cpu)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  ierr = PetscTryMethod(A,"MatCUSPARSESetUseCPUSolve_C",(Mat,PetscBool),(A,use_cpu));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
277 
/* MatSetOption for SEQAIJCUSPARSE: handles MAT_FORM_EXPLICIT_TRANSPOSE itself
   (invalidating any cached GPU transpose when the option is turned off) and
   delegates every other option to the base SeqAIJ implementation. */
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);}
    A->form_explicit_transpose = flg;
  } else {
    ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
295 
/* Defined later in this file; declared here for use by MatLUFactorNumeric_SeqAIJCUSPARSE() below. */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
297 
/* Numeric LU factorization: the factorization itself runs on the CPU via
   MatLUFactorNumeric_SeqAIJ() (after pulling A's values down from the GPU);
   afterwards the appropriate MatSolve variants are installed and, unless the
   user requested CPU solves, the triangular factors are analyzed/copied to the GPU. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ         *b    = (Mat_SeqAIJ*)B->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  IS                 isrow = b->row,iscol = b->col;
  PetscBool          row_identity,col_identity;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);   /* CPU factorization needs up-to-date host values */
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* determine which version of MatSolve needs to be used: identity row and
     column permutations allow the faster natural-ordering kernels */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!cusp->use_cpu_solve) {
    if (row_identity && col_identity) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
  }
  /* no multi-RHS solve support on the GPU */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors onto the device */
  if (!cusp->use_cpu_solve) {
    ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
335 
/* Options processing for SEQAIJCUSPARSE matrices (non-factor matrices only):
   storage format for SpMV / all operations, CPU-vs-GPU triangular solve, and
   (CUDA >= 11) the cuSPARSE SpMV/SpMM/csr2csc algorithm choices. The consistency
   checks guard against cuSPARSE renumbering its enums, since PetscOptionsEnum()
   assigns values by position in the PETSc name arrays. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
    /* the Bool option writes directly into use_cpu_solve; the Set call re-applies it through the composed method */
    ierr = PetscOptionsBool("-mat_cusparse_use_cpu_solve","Use CPU (I)LU solve","MatCUSPARSESetUseCPUSolve",cusparsestruct->use_cpu_solve,&cusparsestruct->use_cpu_solve,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetUseCPUSolve(A,cusparsestruct->use_cpu_solve);CHKERRQ(ierr);}
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
    if (flg && CUSPARSE_SPMV_CSR_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#else
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#endif
    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
   #endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
376 
/* Symbolic ILU: discard any previously-built GPU triangular-factor data, run the
   CPU symbolic phase, and route the numeric phase to the CUSPARSE implementation. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *tri = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&tri);CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
388 
/* Symbolic LU: discard any previously-built GPU triangular-factor data, run the
   CPU symbolic phase, and route the numeric phase to the CUSPARSE implementation. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *tri = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&tri);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
400 
/* Symbolic ICC: discard any previously-built GPU triangular-factor data, run the
   CPU symbolic phase, and route the numeric phase to the CUSPARSE implementation. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *tri = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&tri);CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
412 
/* Symbolic Cholesky: discard any previously-built GPU triangular-factor data, run
   the CPU symbolic phase, and route the numeric phase to the CUSPARSE implementation. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *tri = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&tri);CHKERRQ(ierr);
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
424 
/* Build (first call) or refresh (subsequent calls) the device-side CSR copy of the
   unit-diagonal lower triangular factor L from the host arrays of the CPU-factored
   matrix A, then run the cuSPARSE triangular-solve analysis on it. Host staging
   buffers are allocated pinned (cudaMallocHost) for fast transfers; the values
   buffer is kept on the struct (AA_h) so numeric-only updates can reuse it. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* NOTE(review): counts n unit diagonals plus the strictly-lower entries of rows 1..n-1
         (ai[n]-ai[1]); row 0 is assumed to contribute no off-diagonal L entries -- confirm
         against the SeqAIJ factored-matrix storage layout */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {
        /* first call: build the full CSR structure (indices + values) from scratch */
        PetscScalar                       *AALo;

        cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the lower triangular matrix */
        cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the lower triangular matrix; row 0 holds only its unit diagonal */
        AiLo[0]  = (PetscInt) 0;
        AiLo[n]  = nzLower;
        AjLo[0]  = (PetscInt) 0;
        AALo[0]  = (MatScalar) 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        rowOffset= 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          /* copy the strictly-lower entries of row i, then append its unit diagonal */
          ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix: assigning into the thrust device arrays performs the H2D copies */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* the csrsv2 interface needs an explicitly sized work buffer */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(),
                               #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 loTriFactor->solveInfo,
                                 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                               #else
                                 loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
                               #endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        /* keep the pinned values buffer for the update-only path; index buffers are no longer needed */
        loTriFactor->AA_h = AALo;
        cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the lower triangular matrix (same layout as above: row values then unit diagonal) */
        loTriFactor->AA_h[0]  = 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
          offset      += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset      += 1;
          v  += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
571 
572 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
573 {
574   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
575   PetscInt                          n = A->rmap->n;
576   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
577   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
578   cusparseStatus_t                  stat;
579   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
580   const MatScalar                   *aa = a->a,*v;
581   PetscInt                          *AiUp, *AjUp;
582   PetscInt                          i,nz, nzUpper, offset;
583   PetscErrorCode                    ierr;
584   cudaError_t                       cerr;
585 
586   PetscFunctionBegin;
587   if (!n) PetscFunctionReturn(0);
588   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
589     try {
590       /* next, figure out the number of nonzeros in the upper triangular matrix. */
591       nzUpper = adiag[0]-adiag[n];
592       if (!upTriFactor) {
593         PetscScalar *AAUp;
594 
595         cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
596 
597         /* Allocate Space for the upper triangular matrix */
598         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
599         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
600 
601         /* Fill the upper triangular matrix */
602         AiUp[0]=(PetscInt) 0;
603         AiUp[n]=nzUpper;
604         offset = nzUpper;
605         for (i=n-1; i>=0; i--) {
606           v  = aa + adiag[i+1] + 1;
607           vi = aj + adiag[i+1] + 1;
608 
609           /* number of elements NOT on the diagonal */
610           nz = adiag[i] - adiag[i+1]-1;
611 
612           /* decrement the offset */
613           offset -= (nz+1);
614 
615           /* first, set the diagonal elements */
616           AjUp[offset] = (PetscInt) i;
617           AAUp[offset] = (MatScalar)1./v[nz];
618           AiUp[i]      = AiUp[i+1] - (nz+1);
619 
620           ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
621           ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
622         }
623 
624         /* allocate space for the triangular factor information */
625         ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
626         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
627 
628         /* Create the matrix description */
629         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
630         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
631        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
632         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
633        #else
634         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
635        #endif
636         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
637         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
638 
639         /* set the operation */
640         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
641 
642         /* set the matrix */
643         upTriFactor->csrMat = new CsrMatrix;
644         upTriFactor->csrMat->num_rows = n;
645         upTriFactor->csrMat->num_cols = n;
646         upTriFactor->csrMat->num_entries = nzUpper;
647 
648         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
649         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
650 
651         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
652         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
653 
654         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
655         upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
656 
657         /* Create the solve analysis information */
658         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
659         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
660       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
661         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
662                                      upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
663                                      upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
664                                      upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
665                                      &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
666         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
667       #endif
668 
669         /* perform the solve analysis */
670         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
671                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
672                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
673                                  upTriFactor->csrMat->column_indices->data().get(),
674                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
675                                  upTriFactor->solveInfo,
676                                  upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
677                                #else
678                                  upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
679                                #endif
680         cerr = WaitForCUDA();CHKERRCUDA(cerr);
681         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
682 
683         /* assign the pointer */
684         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
685         upTriFactor->AA_h = AAUp;
686         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
687         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
688         ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
689       } else {
690         if (!upTriFactor->AA_h) {
691           cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
692         }
693         /* Fill the upper triangular matrix */
694         offset = nzUpper;
695         for (i=n-1; i>=0; i--) {
696           v  = aa + adiag[i+1] + 1;
697 
698           /* number of elements NOT on the diagonal */
699           nz = adiag[i] - adiag[i+1]-1;
700 
701           /* decrement the offset */
702           offset -= (nz+1);
703 
704           /* first, set the diagonal elements */
705           upTriFactor->AA_h[offset] = 1./v[nz];
706           ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
707         }
708         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
709         ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
710       }
711     } catch(char *ex) {
712       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
713     }
714   }
715   PetscFunctionReturn(0);
716 }
717 
/*
  Push both ILU triangular factors of A to the GPU and cache, on the device,
  the row/column permutations (when they are not the identity) needed by the
  subsequent MatSolve calls.  Also sizes the temporary work vector used by
  the triangular solves.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           rowperm  = aij->row;
  IS                           colperm  = aij->icol;
  PetscInt                     nrows    = A->rmap->n;
  PetscBool                    rowIsIdentity,colIsIdentity;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (!factors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");

  /* build (or refresh) the L and U factor structures on the device */
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  factors->nnz = aij->nz;
  if (!factors->workVector) factors->workVector = new THRUSTARRAY(nrows);

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices: upload the row permutation once */
  ierr = ISIdentity(rowperm,&rowIsIdentity);CHKERRQ(ierr);
  if (!rowIsIdentity && !factors->rpermIndices) {
    const PetscInt *r;

    ierr = ISGetIndices(rowperm,&r);CHKERRQ(ierr);
    factors->rpermIndices = new THRUSTINTARRAY(nrows);
    factors->rpermIndices->assign(r,r+nrows);
    ierr = ISRestoreIndices(rowperm,&r);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(nrows*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* upper triangular indices: upload the column permutation once */
  ierr = ISIdentity(colperm,&colIsIdentity);CHKERRQ(ierr);
  if (!colIsIdentity && !factors->cpermIndices) {
    const PetscInt *c;

    ierr = ISGetIndices(colperm,&c);CHKERRQ(ierr);
    factors->cpermIndices = new THRUSTINTARRAY(nrows);
    factors->cpermIndices->assign(c,c+nrows);
    ierr = ISRestoreIndices(colperm,&c);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(nrows*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
761 
/*
  MatSeqAIJCUSPARSEBuildICCTriMatrices - build (or refresh) the device-side
  triangular-factor structures for an ICC/Cholesky factor and upload the
  numerical values.

  The host factor is viewed through SeqSBAIJ storage, i.e. only the upper
  triangle is kept.  Both device factors share that upper-triangular CSR
  pattern (AiUp/AjUp): the upper factor solves it directly (non-transpose,
  unit diagonal), while the lower factor reuses the same pattern with its
  own value array (AALo) and a transpose solve operation.  On the first
  call the full structures and cuSPARSE analysis data are created; on later
  calls only the values are recomputed and re-uploaded.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;  /* pinned host CSR row offsets / column indices of the shared pattern */
  PetscScalar                       *AAUp;         /* pinned host values of the upper factor */
  PetscScalar                       *AALo;         /* pinned host values of the lower factor */
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;  /* view the Cholesky factor through SBAIJ (upper-triangle-only) storage */
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* only rebuild when the current data lives on the CPU (or nothing is allocated yet) */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned buffers for both factors' values; freed at the end of this try block */
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (!upTriFactor && !loTriFactor) {
        /* first call: build the pattern and both factor structures from scratch */
        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          /* NOTE(review): assumes the diagonal is the LAST stored entry of each factored row (v[nz]); its reciprocal seeds both factors */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            /* off-diagonals: negated for the upper factor; additionally scaled by the reciprocal diagonal for the lower factor */
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        /* unit diagonal: the triangular solve does not reference the stored diagonal entries */
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* csrsv2 API (CUDA >= 9) needs an explicit user-provided work buffer */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(),
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 upTriFactor->solveInfo,
                                 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                                #else
                                  upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
                                #endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        /* the stored pattern is upper triangular; the transpose solveOp below makes it act as the lower factor */
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix: same CSR pattern (AiUp/AjUp) as the upper factor, but the AALo values */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(),
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 loTriFactor->solveInfo,
                                 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                                #else
                                 loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
                                #endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      } else {
        /* structures already exist: recompute only the numerical values and re-upload them */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      /* release the pinned staging buffers */
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
981 
/*
  Push the ICC triangular factors of A to the GPU and cache, on the device,
  the row permutation (and its inverse) needed by the subsequent MatSolve
  calls when the ordering is not the identity.  Also sizes the temporary
  work vector used by the triangular solves.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  /* both triangles of the symmetric factor minus the shared diagonal */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices: upload the permutation only once.  The
     !rpermIndices guard (matching the ILU path) prevents leaking the
     previously allocated device arrays when the numeric factorization is
     repeated; rperm/cperm are always allocated together below, so checking
     rpermIndices alone suffices. */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity && !cusparseTriFactors->rpermIndices) {
    IS             iip;
    const PetscInt *irip,*rip;

    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1019 
/*
  Numeric Cholesky/ICC factorization for SeqAIJCUSPARSE: pull A back to the
  host, run the CPU numeric factorization, choose the solve variants based
  on the ordering, and push the resulting triangular factors to the GPU.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *bdata   = (Mat_SeqAIJ*)B->data;
  IS             rowperm  = bdata->row;
  PetscBool      natural;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* the CPU kernel needs current host values of A */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* determine which version of MatSolve needs to be used: the natural-ordering
     variants skip the permutation steps */
  ierr = ISIdentity(rowperm,&natural);CHKERRQ(ierr);
  if (natural) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  /* no matsolve variants are provided in either case */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors onto the GPU */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1049 
1050 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
1051 {
1052   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1053   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1054   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1055   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
1056   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1057   cusparseStatus_t                  stat;
1058   cusparseIndexBase_t               indexBase;
1059   cusparseMatrixType_t              matrixType;
1060   cusparseFillMode_t                fillMode;
1061   cusparseDiagType_t                diagType;
1062   cudaError_t                       cerr;
1063   PetscErrorCode                    ierr;
1064 
1065   PetscFunctionBegin;
1066   /* allocate space for the transpose of the lower triangular factor */
1067   ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
1068   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1069 
1070   /* set the matrix descriptors of the lower triangular factor */
1071   matrixType = cusparseGetMatType(loTriFactor->descr);
1072   indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
1073   fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1074     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1075   diagType = cusparseGetMatDiagType(loTriFactor->descr);
1076 
1077   /* Create the matrix description */
1078   stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
1079   stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
1080   stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
1081   stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
1082   stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
1083 
1084   /* set the operation */
1085   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1086 
1087   /* allocate GPU space for the CSC of the lower triangular factor*/
1088   loTriFactorT->csrMat = new CsrMatrix;
1089   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
1090   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
1091   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
1092   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
1093   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1094   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
1095 
1096   /* compute the transpose of the lower triangular factor, i.e. the CSC */
1097 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1098   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1099                                        loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1100                                        loTriFactor->csrMat->values->data().get(),
1101                                        loTriFactor->csrMat->row_offsets->data().get(),
1102                                        loTriFactor->csrMat->column_indices->data().get(),
1103                                        loTriFactorT->csrMat->values->data().get(),
1104                                        loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1105                                        CUSPARSE_ACTION_NUMERIC,indexBase,
1106                                        CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1107   cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1108 #endif
1109 
1110   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1111   stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1112                           loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1113                           loTriFactor->csrMat->values->data().get(),
1114                           loTriFactor->csrMat->row_offsets->data().get(),
1115                           loTriFactor->csrMat->column_indices->data().get(),
1116                           loTriFactorT->csrMat->values->data().get(),
1117                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1118                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1119                           CUSPARSE_ACTION_NUMERIC, indexBase,
1120                           CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat);
1121                         #else
1122                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1123                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1124                         #endif
1125   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1126   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1127 
1128   /* Create the solve analysis information */
1129   ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1130   stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1131 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1132   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
1133                                 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1134                                 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1135                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
1136                                 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1137   cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1138 #endif
1139 
1140   /* perform the solve analysis */
1141   stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
1142                            loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1143                            loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1144                            loTriFactorT->csrMat->column_indices->data().get(),
1145                           #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1146                            loTriFactorT->solveInfo,
1147                            loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
1148                           #else
1149                            loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1150                           #endif
1151   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1152   ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1153 
1154   /* assign the pointer */
1155   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1156 
1157   /*********************************************/
1158   /* Now the Transpose of the Upper Tri Factor */
1159   /*********************************************/
1160 
1161   /* allocate space for the transpose of the upper triangular factor */
1162   ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
1163   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1164 
1165   /* set the matrix descriptors of the upper triangular factor */
1166   matrixType = cusparseGetMatType(upTriFactor->descr);
1167   indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
1168   fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1169     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1170   diagType = cusparseGetMatDiagType(upTriFactor->descr);
1171 
1172   /* Create the matrix description */
1173   stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
1174   stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
1175   stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
1176   stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
1177   stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
1178 
1179   /* set the operation */
1180   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1181 
1182   /* allocate GPU space for the CSC of the upper triangular factor*/
1183   upTriFactorT->csrMat = new CsrMatrix;
1184   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1185   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1186   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1187   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
1188   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1189   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1190 
1191   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1192 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1193   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
1194                                 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1195                                 upTriFactor->csrMat->values->data().get(),
1196                                 upTriFactor->csrMat->row_offsets->data().get(),
1197                                 upTriFactor->csrMat->column_indices->data().get(),
1198                                 upTriFactorT->csrMat->values->data().get(),
1199                                 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1200                                 CUSPARSE_ACTION_NUMERIC,indexBase,
1201                                 CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1202   cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1203 #endif
1204 
1205   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1206   stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
1207                           upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1208                           upTriFactor->csrMat->values->data().get(),
1209                           upTriFactor->csrMat->row_offsets->data().get(),
1210                           upTriFactor->csrMat->column_indices->data().get(),
1211                           upTriFactorT->csrMat->values->data().get(),
1212                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1213                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1214                           CUSPARSE_ACTION_NUMERIC, indexBase,
1215                           CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat);
1216                         #else
1217                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1218                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1219                         #endif
1220 
1221   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1222   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1223 
1224   /* Create the solve analysis information */
1225   ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1226   stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1227   #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1228   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
1229                                  upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1230                                  upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1231                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
1232                                  &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1233   cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1234   #endif
1235 
1236   /* perform the solve analysis */
1237   stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
1238                            upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1239                            upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1240                            upTriFactorT->csrMat->column_indices->data().get(),
1241                           #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1242                            upTriFactorT->solveInfo,
1243                            upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
1244                           #else
1245                            upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1246                           #endif
1247 
1248   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1249   ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1250 
1251   /* assign the pointer */
1252   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1253   PetscFunctionReturn(0);
1254 }
1255 
/* Unary functor converting a PetscScalar to a PetscInt via its real part.
   Used as the transform op of thrust::transform when indices that were stored
   in a scalar array must be turned back into integer indices (e.g. when the
   csr2csc entry permutation is recovered from a csr2csc run on 0,1,2,...). */
struct PetscScalarToPetscInt
{
  __host__ __device__
  PetscInt operator()(PetscScalar x)
  {
    const PetscReal re = PetscRealPart(x); /* complex builds: drop the (zero) imaginary part */
    return static_cast<PetscInt>(re);
  }
};
1264 
/*
  MatSeqAIJCUSPARSEFormExplicitTranspose - builds (or refreshes) an explicit copy of A^T on
  the GPU and caches it in cusparsestruct->matTranspose; A->transupdated marks the cache valid.

  For the CSR format the transpose is computed with cusparse csr2csc. The first call also
  builds cusparsestruct->csr2csc_i: the permutation mapping each entry of the transposed
  (CSC) value array back to its position in the original CSR value array. Subsequent calls
  (with unchanged nonzero pattern) refresh the transposed values with a single thrust gather
  instead of re-running csr2csc.
*/
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* make sure the device mirror of A is current before transposing */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(0); /* cached transpose is still up to date */
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
    /* ELL/HYB transposes are not updated in place below; destroy any cached one so it is recreated */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
  }
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

    /* set alpha and beta: device-resident scalar constants used by SpMV-type calls */
    err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      CsrMatrix *matrixT = new CsrMatrix;
      matstructT->mat = matrixT;
      /* note the swapped dimensions: the transpose is (A->cmap->n) x (A->rmap->n) */
      matrixT->num_rows = A->cmap->n;
      matrixT->num_cols = A->rmap->n;
      matrixT->num_entries = a->nz;
      matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values = new THRUSTARRAY(a->nz);

      /* mirror the host row offsets a->i on the GPU (32-bit array) */
      if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      #if PETSC_PKG_CUDA_VERSION_GE(11,2,1)
        stat = cusparseCreateCsr(&matstructT->matDescr,
                               matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                               matrixT->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
      #else
        /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
           see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

           I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
           it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
           when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
        */
        if (matrixT->num_entries) {
          stat = cusparseCreateCsr(&matstructT->matDescr,
                                 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                                 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                                 matrixT->values->data().get(),
                                 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I,
                                 indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);

        } else {
          matstructT->matDescr = NULL;
          matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
        }
      #endif
     #endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
   #else
      /* HYB/ELL cannot be transposed directly: go HYB -> CSR -> CSC (= transpose) -> HYB,
         using two temporary CSR containers that are freed at the end of this branch */
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows = A->rmap->n;
      temp->num_cols = A->cmap->n;
      temp->num_entries = a->nz;
      temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle,
                              matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get());CHKERRCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      tempT->num_rows = A->rmap->n;
      tempT->num_cols = A->cmap->n;
      tempT->num_entries = a->nz;
      tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                              temp->num_cols, temp->num_entries,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get(),
                              tempT->values->data().get(),
                              tempT->column_indices->data().get(),
                              tempT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

      /* Last, convert CSC to HYB */
      cusparseHybMat_t hybMat;
      stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
      cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
        CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                              matstructT->descr, tempT->values->data().get(),
                              tempT->row_offsets->data().get(),
                              tempT->column_indices->data().get(),
                              hybMat, 0, partition);CHKERRCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY*) tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
        delete (CsrMatrix*) tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY*) temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
        delete (CsrMatrix*) temp;
      }
     #endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
    if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix");
    if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows");
    if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols");
    if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values");
    if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT");
    if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows");
    if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols");
    if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
      ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    if (!cusparsestruct->csr2csc_i) {
      /* Build the CSR->CSC entry permutation: run csr2csc on the values 0,1,2,...,nnz-1
         so matrixT->values ends up holding, for each CSC entry, its CSR source position */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      void   *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                           A->cmap->n, matrix->num_entries,
                                           matrix->values->data().get(),
                                           cusparsestruct->rowoffsets_gpu->data().get(),
                                           matrix->column_indices->data().get(),
                                           matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                           CUSPARSE_ACTION_NUMERIC,indexBase,
                                           cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat);
      err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err);
     #endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                              A->cmap->n,matrix->num_entries,
                              csr2csc_a.data().get(),
                              cusparsestruct->rowoffsets_gpu->data().get(),
                              matrix->column_indices->data().get(),
                              matrixT->values->data().get(),
                             #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                              matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                              CUSPARSE_ACTION_NUMERIC,indexBase,
                              cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat);
                             #else
                              matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
                             #endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
      }

      /* matrixT->values now encodes the permutation as scalars; convert it to integers */
      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      err = cudaFree(csr2cscBuffer);CHKERRCUDA(err);
     #endif
    }
    /* gather the (possibly updated) CSR values into the transposed layout via csr2csc_i */
    PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
                                                     thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
                                                     matrixT->values->begin()));
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices are not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
1494 
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
  MatSolveTranspose_SeqAIJCUSPARSE - solves A^T x = b on the GPU using the cached transposed
  triangular factors (built on demand by MatSeqAIJCUSPARSEAnalyzeTransposeForSolve), applying
  the reorderings stored in rpermIndices/cpermIndices.

  NOTE(review): the exact permutation convention (which side rperm/cperm apply to) is set by
  the factorization routine, which is not visible here — confirm against it before changing
  the gather/scatter order below.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder b with the row permutation, gathering the result into x */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve with the transposed upper factor (for A = LU, A^T = U^T L^T, so U^T comes first): x -> temp */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Then, solve with the transposed lower factor: temp -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                         xarray);CHKERRCUSPARSE(stat);
                      #endif

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves: ~2*nnz - n flops */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1583 
/*
  MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering - solves A^T x = b using the cached
  transposed triangular factors when the factorization used the natural ordering, so no
  row/column permutations need to be applied (b is fed to the first solve directly).
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve with the transposed upper factor (A^T = U^T L^T, so U^T comes first): b -> temp */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Then, solve with the transposed lower factor: temp -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        xarray);CHKERRCUSPARSE(stat);
                      #endif

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves: ~2*nnz - n flops */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1653 
/*
  MatSolve_SeqAIJCUSPARSE - solves A x = b on the GPU with the cached L and U triangular
  factors, applying the row permutation to b before the solves and the column permutation
  to the result (forward solve with L, then backward solve with U).

  NOTE(review): the exact permutation convention is set by the factorization routine, which
  is not visible here — confirm against it before changing the gather/scatter order below.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder b with the row permutation, gathering the result into the work vector */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L: temp -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                         xarray,
                         loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                         xarray);CHKERRCUSPARSE(stat);
                      #endif

  /* Then, solve U: x -> temp */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,xarray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves: ~2*nnz - n flops */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1729 
/*
  MatSolve for an LU-factored SeqAIJCUSPARSE matrix when the factorization used the
  natural (identity) ordering, so no row/column permutation of b or x is needed:
  solve L y = b into the work vector, then U x = y directly into x.
  bb is accessed read-only; xx is write-only.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; /* holds y between the two solves */
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L: barray -> tempGPU (the CUDA>=9 API additionally takes nnz, a solve policy and a user buffer) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        loTriFactor->solvePolicy,loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Next, solve U: tempGPU -> xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        xarray);CHKERRCUSPARSE(stat);
                      #endif

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves: 2*nnz multiply-adds minus the n divisions counted once */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1791 
/*
  Mirrors the nonzero values from the device back to the host when the device copy
  is the only up-to-date one. Only the values move: the sparsity pattern (i,j) is
  never modified on the device by this path.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *aij  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cudaError_t        cerr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    /* NOTE(review): this cast assumes the CSR storage format; verify behavior for ELL/HYB formats */
    CsrMatrix *csr = (CsrMatrix*)cusp->mat->mat;

    ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    cerr = cudaMemcpy(aij->a, csr->values->data().get(), aij->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuToCpu(aij->nz*sizeof(PetscScalar));CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    /* host and device now hold the same values */
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
1812 
/* MatSeqAIJGetArray() implementation: hand out the host value array after making
   sure any newer device values have been copied back to the host first */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr   = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  *array = ((Mat_SeqAIJ*)A->data)->a;
  PetscFunctionReturn(0);
}
1822 
/* Restore counterpart of MatSeqAIJGetArray_SeqAIJCUSPARSE(): the caller had
   read-write access, so the device copy must be treated as stale from now on */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  *array         = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU; /* only the host holds the current values */
  PetscFunctionReturn(0);
}
1830 
/* MatSeqAIJGetArrayRead() implementation: same sync-then-expose as the read-write
   variant; the offload mask is left untouched since the caller promises not to
   modify the values */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr   = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  *array = ((Mat_SeqAIJ*)A->data)->a;
  PetscFunctionReturn(0);
}
1840 
/* Restore counterpart of MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(): read-only access,
   so the offload mask is not changed (the device copy remains valid) */
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(0);
}
1847 
/* MatSeqAIJGetArrayWrite() implementation: the caller will overwrite every value,
   so no device-to-host copy is performed before exposing the host array */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = ((Mat_SeqAIJ*)A->data)->a;
  PetscFunctionReturn(0);
}
1854 
/* Restore counterpart of MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(): the host values
   were just (re)written, so mark the device copy as out of date */
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  *array         = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU; /* fresh values live on the host only */
  PetscFunctionReturn(0);
}
1862 
/*
  Pushes the host AIJ data to the GPU, creating the cuSPARSE representation if needed.
  Two paths: if the nonzero pattern is unchanged and the format is CSR, only the values
  are re-uploaded; otherwise the whole device structure (descriptors, CSR/HYB storage,
  compressed-row index map) is rebuilt from scratch. Errors out if the matrix is bound
  to the CPU.
*/
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE; /* set to FALSE when the host has no value array to mirror */
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      matrix->values->assign(a->a, a->a+a->nz);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* the cached transpose (if any) now holds stale values; invalidate values only */
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
    } else {
      /* the nonzero pattern changed (or a non-CSR format is in use): rebuild everything */
      PetscInt nnz;
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        /* compressed-row storage skips empty rows: m/ii/ridx then describe only the nonempty ones */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
        if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");
        /* no host values yet: size from the row offsets and do not mark the host copy valid */
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
        else nnz = a->nz;

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident scalars (1, 0, 1) used with CUSPARSE_POINTER_MODE_DEVICE below */
        err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                    mat->num_rows, mat->num_cols, mat->num_entries,
                                    mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                    mat->values->data().get(),
                                    CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                    CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
          }
         #endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
         #else
          /* build a temporary CSR on the device, convert it to HYB, then free the CSR */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
              matstruct->descr, mat->values->data().get(),
              mat->row_offsets->data().get(),
              mat->column_indices->data().get(),
              hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
         #endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        /* account for row offsets + column indices (int), cprowIndices (PetscInt), and values + the 3 scalar constants */
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
2019 
/* Thrust functor for zipped iterators: accumulate element 0 into element 1 (dst += src) */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple tup)
  {
    thrust::get<1>(tup) += thrust::get<0>(tup);
  }
};
2029 
/* Thrust functor for zipped iterators: copy element 0 into element 1 (dst = src) */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple tup)
  {
    thrust::get<1>(tup) = thrust::get<0>(tup);
  }
};
2039 
/* Thrust functor for zipped iterators: copy in the opposite direction, element 1 into element 0 */
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple tup)
  {
    thrust::get<0>(tup) = thrust::get<1>(tup);
  }
};
2049 
/* Per-product state attached to C->product->data for cuSPARSE matrix-matrix products;
   released by MatDestroy_MatMatCusparse() */
struct MatMatCusparse {
  PetscBool             cisdense;  /* C was MATSEQDENSE (CPU) on entry to the symbolic phase; convert it back after the numeric phase */
  PetscScalar           *Bt;       /* device buffer holding B^T for pre-CUDA-11 csrmm, which cannot transpose B */
  Mat                   X;         /* intermediate dense result (A*P or A*R^T) for the PtAP/RARt products */
  PetscBool             reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops;
  CsrMatrix             *Bcsr;

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr;
  PetscBool             initialized;   /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr;    /* dense descriptor for B */
  cusparseDnMatDescr_t  matCDescr;    /* dense descriptor for C (or X) */
  PetscInt              Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  void                  *dBuffer4;
  void                  *dBuffer5;
 #endif
  size_t                mmBufferSize; /* current capacity of mmBuffer */
  void                  *mmBuffer;
  void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};
2074 
/* Destructor for MatMatCusparse product data: frees the device buffers, the cuSPARSE
   descriptors (only those that were actually created), the intermediate matrix X,
   and finally the struct itself */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  PetscErrorCode   ierr;
  MatMatCusparse   *mmdata = (MatMatCusparse *)data;
  cudaError_t      cerr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t stat;
 #endif

  PetscFunctionBegin;
  /* cudaFree(NULL) is a no-op, so Bt need not be checked */
  cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr);
  delete mmdata->Bcsr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->matBDescr)   { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->matCDescr)   { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->spgemmDesc)  { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); }
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  if (mmdata->dBuffer4)  { cerr = cudaFree(mmdata->dBuffer4);CHKERRCUDA(cerr); }
  if (mmdata->dBuffer5)  { cerr = cudaFree(mmdata->dBuffer5);CHKERRCUDA(cerr); }
 #endif
  if (mmdata->mmBuffer)  { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); }
  if (mmdata->mmBuffer2) { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); }
 #endif
  ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2103 
2104 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
2105 
/*
  Numeric phase of C = op(A) op(B) with A sparse (SeqAIJCUSPARSE) and B, C dense.
  Supports AB, AtB, ABt, and — via the intermediate X computed here followed by a
  dense-dense multiply — PtAP and RARt. On CUDA >= 11 this uses the generic
  cusparseSpMM API (with cached descriptors and a reusable work buffer); on older
  CUDA it falls back to csrmm, explicitly transposing B when needed since csrmm
  cannot transpose the dense operand.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr   = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp   = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* pick the sparse operand (A or its explicit transpose), the cusparse op, and the
     dimensions of the sparse-times-dense result (X for PtAP/RARt, C otherwise) */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    /* either let cusparse transpose on the fly, or form A^T explicitly once and reuse it */
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);

  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* for PtAP/RARt the sparse-dense product goes into the intermediate X, not C */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
    /* grow (never shrink) the work buffer to the size this SpMM needs */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      cudaError_t cerr;
      cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
      cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
 #else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* out-of-place transpose: Bt = 1*B^T + 0*B^T via geam */
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k    = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
 #endif
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  /* finish PtAP/RARt with the dense-dense product against X */
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  /* convert C and B back to their original (CPU) types if we converted them above */
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2278 
/*
  Symbolic phase of the sparse (SeqAIJCUSPARSE) times dense product: sets the sizes
  and type of C, allocates the MatMatCusparse product data (including the B^T buffer
  for pre-CUDA-11 csrmm and the intermediate X for PtAP/RARt), and installs the
  numeric-phase callback. Only the CSR storage format is supported.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  /* dimensions of the result C for each supported product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense;
 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
 #endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2352 
/*
  Numeric phase of the sparse-sparse product C = op(A)*op(B) where A, B and C are all
  MATSEQAIJCUSPARSE in CSR format. The sparsity pattern of C (and all cuSPARSE
  descriptors/buffers in C->product->data) must already have been built by
  MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; this routine only (re)computes the
  numerical values on the GPU and then performs a shortened MatAssemblyEnd.

  Supported product types: AB, AtB (via A's explicit transpose), ABt (via B's explicit
  transpose); cuSPARSE spgemm itself is always called with non-transpose ops.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t         BmatSpDescr;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;
  A = product->A;
  B = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    /* Values were computed during symbolic; only sanity-check C's GPU structures, then
       jump straight to the assembly bookkeeping. The flag is one-shot: subsequent
       numeric calls recompute. */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix*)Cmat->mat;
    if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
    goto finalize;
  }
  if (!c->nz) goto finalize; /* empty C: nothing to compute, just finish assembly */
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  /* push fresh host values (if any) to the device before multiplying */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);

  /* Mirror the symmetry-based simplifications done in the symbolic phase:
     A symmetric turns AtB into AB, B symmetric turns ABt into AB. The symbolic
     phase must have recorded having used the same fact, or the stored descriptors
     would not match. */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    if (!product->symbolic_used_the_fact_A_is_symmetric) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    if (!product->symbolic_used_the_fact_B_is_symmetric) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric");
  }
  /* select which stored mult structs participate: explicit transposes substitute for op(A)/op(B) */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix*)Cmat->mat;
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
    /* CUDA >= 11.4: the SpGEMMreuse API recomputes values into the descriptor set up at symbolic time */
    stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                               Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                               cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                               mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
  #else
    /* CUDA 11.0-11.3: generic SpGEMM needs compute (reusing the buffer sized at symbolic time)
       followed by copy to move the result into Cmat->matDescr's arrays */
    stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                               Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                               cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                               mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
    stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                               Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                               cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
  #endif
#else
  /* CUDA < 11: legacy csrgemm writes directly into C's preallocated CSR arrays */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr); /* flops counted during the symbolic phase */
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  C->offloadmask = PETSC_OFFLOAD_GPU; /* result lives on the device only; host copy is stale */
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
  ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
  c->reallocs         = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}
2482 
/*
  Symbolic phase of the sparse-sparse product C = op(A)*op(B) for MATSEQAIJCUSPARSE
  operands. Determines the sparsity pattern of C on the GPU, allocates C's CSR storage
  (device and host mirrors), and caches every cuSPARSE descriptor/buffer needed so the
  numeric phase can recompute values cheaply. The product data (MatMatCusparse) is
  attached to C->product->data with MatDestroy_MatMatCusparse as its destructor.

  Three backends, selected at compile time:
    - CUDA >= 11.4: cusparseSpGEMMreuse_* (workEstimation/nnz/copy/compute);
      dBuffer4/dBuffer5 are kept alive in mmdata for reuse by numeric.
    - CUDA 11.0-11.3: cusparseSpGEMM_* (workEstimation/compute/copy); both
      mmBuffer2 and mmBuffer must outlive the symbolic call.
    - CUDA < 11: cusparseXcsrgemmNnz for the pattern, then a full numeric
      csrgemm (the legacy API cannot do symbolic-only — see comment below).

  Compressed-row handling: if A is stored compressed, C inherits A's compressed row
  pattern; if B is compressed, a temporary full-row-offset view of B (sharing B's
  column/value arrays) is built in mmdata->Bcsr for the spgemm calls.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *a,*b,*c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     i,j,m,n,k;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble               flops;
  PetscBool                    biscompressed,ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
  cusparseSpMatDescr_t         BmatSpDescr;
#else
  int                          cnz;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ*)A->data;
  b = (Mat_SeqAIJ*)B->data;
  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");

  /* Exploit symmetry: At == A (resp. Bt == B) lets us run the cheaper AB path; record
     the fact so the numeric phase can verify it made the same simplification */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
  }
  if (B->symmetric && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
  }
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  /* pick dimensions (m x n result, inner dim k) and the mult structs to use; transposed
     operands are realized as explicit device transposes */
  switch (ptype) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    k = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr);
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  ierr  = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  ierr  = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  c     = (Mat_SeqAIJ*)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
  Ccsr  = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
    /* C's nonzero rows are exactly A's nonzero rows, so copy A's row index over */
    c->compressedrow.nrows = a->compressedrow.nrows;
    ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
    ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m; /* device-side row count */
  Ccusp->mat      = Cmat;
  Ccusp->mat->mat = Ccsr;
  Ccsr->num_rows    = Ccusp->nrows;
  Ccsr->num_cols    = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
  stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
  /* device-resident scalar constants used with CUSPARSE_POINTER_MODE_DEVICE */
  cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
    /* short-circuit the degenerate case with an all-zero pattern and skip cuSPARSE entirely */
    thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
    c->nz = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix*)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix */
    /* Build a shallow view of B with full (uncompressed) row offsets; column_indices
       and values are shared with B's own CsrMatrix, only row_offsets differ. Owned by
       mmdata and freed by MatDestroy_MatMatCusparse. */
    CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
    Bcsr = new CsrMatrix;
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices;
    Bcsr->values         = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) { /* lazily upload B's full host row offsets, cached on B */
      Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
      ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr = Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
                               Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                               Bcsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  /* precompute flops count */
  if (ptype == MATPRODUCT_AB) {
    /* 2 flops (mul+add) per B-row entry reached through each entry of A */
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i+1];
      for (j=st; j<en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2.*(b->i[brow+1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    /* each shared row i of A and B pairs every A entry with every B entry in that row */
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt anzi = a->i[i+1] - a->i[i];
      const PetscInt bnzi = b->i[i+1] - b->i[i];
      flops += (2.*anzi)*bnzi;
    }
  } else { /* TODO */
    flops = 0.;
  }

  mmdata->flops = flops; /* cached so the numeric phase can log the same count */
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  /* C's descriptor starts with nnz 0 / NULL arrays; cuSPARSE fills in the size and we set real pointers below */
  stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
                          NULL, NULL, NULL,
                          CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                          CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
 {
  /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
     We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
  */
  void*  dBuffer1 = NULL;
  void*  dBuffer2 = NULL;
  void*  dBuffer3 = NULL;
  /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
  size_t bufferSize1 = 0;
  size_t bufferSize2 = 0;
  size_t bufferSize3 = 0;
  size_t bufferSize4 = 0;
  size_t bufferSize5 = 0;

  /*----------------------------------------------------------------------*/
  /* ask bufferSize1 bytes for external memory */
  stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                            CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                            &bufferSize1, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &dBuffer1, bufferSize1);CHKERRCUDA(cerr);
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                            CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                            &bufferSize1, dBuffer1);CHKERRCUSPARSE(stat);

  /*----------------------------------------------------------------------*/
  /* compute C's nnz: query-then-call pattern (NULL buffers first to size them) */
  stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                 &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &dBuffer2, bufferSize2);CHKERRCUDA(cerr);
  cerr = cudaMalloc((void**) &dBuffer3, bufferSize3);CHKERRCUDA(cerr);
  cerr = cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4);CHKERRCUDA(cerr);
  stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                 &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);CHKERRCUSPARSE(stat);
  cerr = cudaFree(dBuffer1);CHKERRCUDA(cerr);
  cerr = cudaFree(dBuffer2);CHKERRCUDA(cerr);

  /*----------------------------------------------------------------------*/
  /* get matrix C non-zero entries C_nnz1 */
  stat  = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
  c->nz = (PetscInt) C_nnz1;
  /* allocate matrix C */
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values         = new THRUSTARRAY(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  /* update matC with the new pointers */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());CHKERRCUSPARSE(stat);

  /*----------------------------------------------------------------------*/
  /* copy the sparsity pattern into C; dBuffer5 must persist for later _compute calls */
  stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                  &bufferSize5, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5);CHKERRCUDA(cerr);
  stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                  &bufferSize5, mmdata->dBuffer5);CHKERRCUSPARSE(stat);
  cerr = cudaFree(dBuffer3);CHKERRCUDA(cerr);
  /* compute the values once here, so an api_user numeric phase can reuse them (reusesym) */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                                     Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                     cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                     mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
  ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024);CHKERRQ(ierr);
 }
 #else
  size_t bufSize2;
  /* ask bufferSize bytes for external memory */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
  /* ask bufferSize again bytes for external memory */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
  /* The CUSPARSE documentation is not clear, nor the API
     We need both buffers to perform the operations properly!
     mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
     it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
     is stored in the descriptor! What a messy API... */
  cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
  /* compute the intermediate product of A * B */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  /* get matrix C non-zero entries C_nnz1 */
  stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
  c->nz = (PetscInt) C_nnz1;
  ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr);
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
 #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
#else
  /* legacy (CUDA < 11) path: nnz count with host pointer mode, then full numeric gemm */
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
  stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
  c->nz = cnz;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
     I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
     D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalizesym:
  /* mirror the device CSR pattern into the host Mat_SeqAIJ so CPU code can inspect C */
  c->singlemalloc = PETSC_FALSE;
  c->free_a       = PETSC_TRUE;
  c->free_ij      = PETSC_TRUE;
  ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
  if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
    PetscInt *d_i = c->i;
    THRUSTINTARRAY ii(Ccsr->row_offsets->size());
    THRUSTINTARRAY jj(Ccsr->column_indices->size());
    ii   = *Ccsr->row_offsets;
    jj   = *Ccsr->column_indices;
    if (ciscompressed) d_i = c->compressedrow.i; /* compressed pattern lands in compressedrow.i, expanded below */
    cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  } else {
    PetscInt *d_i = c->i;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  if (ciscompressed) { /* need to expand host row offsets */
    /* fill c->i for all m rows: rows absent from the compressed index repeat the previous offset */
    PetscInt r = 0;
    c->i[0] = 0;
    for (k = 0; k < c->compressedrow.nrows; k++) {
      const PetscInt next = c->compressedrow.rindex[k];
      const PetscInt old = c->compressedrow.i[k];
      for (; r < next; r++) c->i[r+1] = old;
    }
    for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
  }
  ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
  c->maxnz = c->nz;
  c->nonzerorowcnt = 0;
  c->rmax = 0;
  /* per-row lengths and row-related statistics derived from the expanded offsets */
  for (k = 0; k < m; k++) {
    const PetscInt nn = c->i[k+1] - c->i[k];
    c->ilen[k] = c->imax[k] = nn;
    c->nonzerorowcnt += (PetscInt)!!nn;
    c->rmax = PetscMax(c->rmax,nn);
  }
  ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); /* host value array allocated but not filled here */
  Ccsr->num_entries = c->nz;

  C->nonzerostate++;
  ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
  Ccusp->nonzerostate = C->nonzerostate;
  C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
  C->preallocated  = PETSC_TRUE;
  C->assembled     = PETSC_FALSE;
  C->was_assembled = PETSC_FALSE;
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
2881 
2882 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2883 
/* handles sparse or dense B */
/* Chooses the symbolic-product implementation for a product whose A is SeqAIJCUSPARSE.
   Dispatch order: dense B -> CUDA dense path (unless A is bound to CPU); all-CUSPARSE
   operands -> CUDA sparse path (unless the user asked for the CPU backend via options);
   otherwise fall back to the plain SeqAIJ dispatch. */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
  PetscErrorCode ierr;
  PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr);
  /* only probe B's type when neither A nor B has been bound to the CPU */
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr);
  }
  /* ABC products additionally require C on the GPU; Ciscusp defaults to true otherwise */
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr);
    }
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool usecpu = PETSC_FALSE;
    /* Each product type has its own -*_backend_cpu option; the option name depends on
       whether the user entered through the legacy API (MatMatMult etc., api_user set)
       or the MatProduct API */
    switch (product->type) {
    case MATPRODUCT_AB:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_ab_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_AtB:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_atb_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_PtAP:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_ptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_RARt:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_rart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_ABC:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_abc_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    default:
      break;
    }
    /* user asked for the CPU backend: pretend operands are not on the GPU */
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
     if (product->A->boundtocpu) {
        ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC:
      /* no specialized sparse*dense triple product; compose from pairwise products */
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      /* triple products are realized as sequences of the binary GPU products */
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3006 
/* yy = A*xx on the GPU; forwards to the shared kernel with no add-vector,
   no transpose, no Hermitian conjugation */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3015 
/* zz = A*xx + yy on the GPU; forwards to the shared kernel (no transpose) */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3024 
/* yy = A^H*xx on the GPU; forwards to the shared kernel with trans=herm=true */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3033 
/* zz = A^H*xx + yy on the GPU; forwards to the shared kernel with trans=herm=true */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3042 
/* yy = A^T*xx on the GPU; forwards to the shared kernel with trans=true, herm=false */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3051 
/* y[idx[i]] += x[i] for i in [0,n).
   Used by MatMultAddKernel_SeqAIJCUSPARSE to merge the compressed-row SpMV result
   (held in the short work vector x) into the full output vector y; idx holds the
   destination row of each work-vector entry.
   Launch: 1-D grid covering at least n threads (caller uses 256-thread blocks);
   the bounds guard makes any sufficiently large grid valid.
   Precondition: entries of idx are assumed distinct (one per nonzero row) so no
   atomics are needed — TODO confirm against how cprowIndices is built. */
__global__ static void ScatterAdd(PetscInt n, const PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) y[idx[i]] += x[i];
}
3057 
/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
/* Shared implementation behind MatMult/MatMultAdd/MatMultTranspose/... for SeqAIJCUSPARSE.
   Handles compressed rows (zero rows dropped on the GPU) via a work vector plus a scatter,
   caches cuSPARSE dense-vector descriptors per operation (CUDA >= 11), and logs GPU flops.
   yy may be NULL (no add) or alias zz. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny; /* lengths of x and y as seen by op(A); set only on the CSR path */
#endif

  PetscFunctionBegin;
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
  /* empty matrix: op(A) x is zero, so the result is just yy (or zero) */
  if (!a->nonzerorowcnt) {
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    /* transposed products use either cuSPARSE's transpose operation on the stored matrix,
       or an explicitly formed transpose (built lazily) when A->form_explicit_transpose is set */
    if (herm || !A->form_explicit_transpose) {
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
     #endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
     #endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* opA indexes the per-operation descriptor cache cuSpMV[]; guard against ABI drift */
      if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        cudaError_t cerr;
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                matstruct->matDescr,
                                matstruct->cuSpMV[opA].vecXDescr, beta,
                                matstruct->cuSpMV[opA].vecYDescr,
                                cusparse_scalartype,
                                cusparsestruct->spmvAlg,
                                &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
      }

      stat = cusparseSpMV(cusparsestruct->handle, opA,
                               matstruct->alpha_one,
                               matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
                               matstruct->cuSpMV[opA].vecXDescr,
                               beta,
                               matstruct->cuSpMV[opA].vecYDescr,
                               cusparse_scalartype,
                               cusparsestruct->spmvAlg,
                               matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
     #else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha_one, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
     #endif
    } else {
      if (cusparsestruct->nrows) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha_one, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
       #endif
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
         */
       #if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
       #else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
       #endif
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      /* transpose path: the SpMV already produced the full-length result; only add yy if distinct */
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  /* 2 flops (multiply+add) per stored nonzero; without yy, nonzero rows save one add each */
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3249 
/* zz = A^T*xx + yy on the GPU; forwards to the shared kernel with trans=true, herm=false */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3258 
/* Finishes assembly on the host side, then discards the cached device-side matrix
   whenever the assembly changed the nonzero structure, so a fresh one is built
   on the next request. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscObjectState   statebefore = A->nonzerostate; /* snapshot before host assembly */
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  /* a stale device mat would describe the old sparsity pattern */
  if (cusp->deviceMat && statebefore != A->nonzerostate) {
    cudaError_t cerr;

    ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr);
    cerr = cudaFree(cusp->deviceMat);CHKERRCUDA(cerr);
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}
3276 
3277 /* --------------------------------------------------------------------------------*/
3278 /*@
3279    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
3281    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
3282    assembly performance the user should preallocate the matrix storage by setting
3283    the parameter nz (or the array nnz).  By setting these parameters accurately,
3284    performance during matrix assembly can be increased by more than a factor of 50.
3285 
3286    Collective
3287 
3288    Input Parameters:
3289 +  comm - MPI communicator, set to PETSC_COMM_SELF
3290 .  m - number of rows
3291 .  n - number of columns
3292 .  nz - number of nonzeros per row (same for all rows)
3293 -  nnz - array containing the number of nonzeros in the various rows
3294          (possibly different for each row) or NULL
3295 
3296    Output Parameter:
3297 .  A - the matrix
3298 
3299    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
3300    MatXXXXSetPreallocation() paradgm instead of this routine directly.
3301    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
3302 
3303    Notes:
3304    If nnz is given then nz is ignored
3305 
3306    The AIJ format (also called the Yale sparse matrix format or
3307    compressed row storage), is fully compatible with standard Fortran 77
3308    storage.  That is, the stored row and column indices can begin at
3309    either one (as in Fortran) or zero.  See the users' manual for details.
3310 
3311    Specify the preallocated storage with either nz or nnz (not both).
3312    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
3313    allocation.  For large problems you MUST preallocate memory or you
3314    will get TERRIBLE performance, see the users' manual chapter on matrices.
3315 
3316    By default, this format uses inodes (identical nodes) when possible, to
3317    improve numerical efficiency of matrix-vector products and solves. We
3318    search for consecutive rows with the same nonzero structure, thereby
3319    reusing matrix information to achieve increased efficiency.
3320 
3321    Level: intermediate
3322 
3323 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
3324 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  /* sequential matrix: local sizes and global sizes coincide */
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* cast drops const to match the preallocation signature; nnz is only read */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3336 
/* Destroys a SeqAIJCUSPARSE matrix: frees the GPU-side data hanging off A->spptr
   (plain mult structures or triangular factors, depending on factortype), detaches
   all composed methods registered for this type, and finishes with the base SeqAIJ
   destroy. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode    ierr;
  PetscInt          i;
  /* composed-function names to detach before handing off to MatDestroy_SeqAIJ */
  const char *const composed[] = {"MatSeqAIJCopySubArray_C",
                                  "MatCUSPARSESetFormat_C",
                                  "MatCUSPARSESetUseCPUSolve_C",
                                  "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",
                                  "MatProductSetFromOptions_seqaijcusparse_seqdense_C",
                                  "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",
                                  "MatFactorGetSolverType_C",
                                  "MatSetPreallocationCOO_C",
                                  "MatSetValuesCOO_C",
                                  "MatConvert_seqaijcusparse_hypre_C"};

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  for (i = 0; i < (PetscInt)(sizeof(composed)/sizeof(composed[0])); i++) {
    ierr = PetscObjectComposeFunction((PetscObject)A,composed[i],NULL);CHKERRQ(ierr);
  }
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3360 
3361 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
3362 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicates A by copying the host (SeqAIJ) representation, then converting the
   copy in place back to the CUSPARSE type so it acquires GPU structures */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3372 
/* Y = a*X + Y on the GPU. Picks the cheapest path by nonzero pattern:
   identical patterns -> one cuBLAS axpy on the value arrays; subset pattern ->
   cuSPARSE csrgeam writing in place into Y; otherwise fall back to the host
   SeqAIJ implementation (which may change Y's pattern). */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry,*csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
  /* mismatched axpy implementations means one operand is not on the GPU path: use the CPU code */
  if (X->ops->axpy != Y->ops->axpy) {
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
  if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;
  /* see if we can turn this into a cublas axpy */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    /* compare row offsets first (cheaper), then column indices, both on the device */
    bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
    if (eq) {
      eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
    }
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    /* Y's pattern contains X's: csrgeam computes a*X + 1.0*Y directly into Y's arrays */
    cusparseStatus_t stat;
    PetscScalar      b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t           bufferSize;
    void             *buffer;
    cudaError_t      cerr;
#endif

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    /* alpha/beta (&a, &b) live on the host for these calls; restore device mode afterwards */
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* CUDA >= 11 requires an explicit workspace buffer for csrgeam */
    stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
                                          &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                          &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                             cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    cerr = cudaFree(buffer);CHKERRCUDA(cerr);
#else
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
#endif
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical patterns: the value arrays line up entry-for-entry, so a plain axpy suffices */
    cublasHandle_t cublasv2handle;
    cublasStatus_t berr;
    PetscBLASInt   one = 1, bnz = 1;

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr);
    ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else {
    /* different patterns: host fallback may restructure Y, so drop its cached transpose */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3470 
/* Y <- a*Y: scales every stored nonzero of Y on the GPU with a single cuBLAS scal call */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)Y->data;
  cublasHandle_t handle;
  cublasStatus_t cberr;
  PetscScalar    *yarray;
  PetscBLASInt   inc = 1, n = 1;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUBLASGetHandle(&handle);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(aij->nz,&n);CHKERRQ(ierr);
  /* device pointer to Y's CSR value array */
  ierr = MatSeqAIJCUSPARSEGetArray(Y,&yarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  cberr = cublasXscal(handle,n,&a,yarray,inc);CHKERRCUBLAS(cberr);
  ierr = PetscLogGpuFlops(n);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSERestoreArray(Y,&yarray);CHKERRQ(ierr);
  /* any cached diagonal no longer matches the scaled values */
  ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3492 
/* Zero all stored values of A: the device CSR copies (matrix and its cached
   transpose, when present) via thrust::fill, and the host array via
   PetscArrayzero.  The offload mask records where valid (zeroed) data lives. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  Mat_SeqAIJ     *aij   = (Mat_SeqAIJ*)A->data;
  PetscBool      ongpu  = PETSC_FALSE; /* set when the device values were zeroed too */
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
    if (spptr->mat) {
      CsrMatrix *csr = (CsrMatrix*)spptr->mat->mat;
      if (csr->values) {
        ongpu = PETSC_TRUE;
        thrust::fill(thrust::device,csr->values->begin(),csr->values->end(),0.);
      }
    }
    if (spptr->matTranspose) {
      CsrMatrix *csrT = (CsrMatrix*)spptr->matTranspose->mat;
      if (csrT->values) thrust::fill(thrust::device,csrT->values->begin(),csrT->values->end(),0.);
    }
  }
  /* zero the host copy as well (shorter version of MatZeroEntries_SeqAIJ) */
  ierr = PetscArrayzero(aij->a,aij->i[A->rmap->n]);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  A->offloadmask = ongpu ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
3523 
/* Switch the operations of A between the CPU (SeqAIJ) and GPU (SeqAIJCUSPARSE)
   implementations.  flg == PETSC_TRUE binds to the CPU: device data is copied
   back to the host first and GPU-specific method pointers / composed functions
   are removed.  flg == PETSC_FALSE (re)installs the CUSPARSE implementations. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    /* factored matrices only record the binding; their ops are managed elsewhere */
    A->boundtocpu = flg;
    PetscFunctionReturn(0);
  }
  if (flg) {
    /* make the host copy current before the CPU kernels take over */
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* clearing a->ops drops the CUSPARSE get/restore-array hooks installed below */
    ierr = PetscMemzero(a->ops,sizeof(Mat_SeqAIJOps));CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    /* route MatSeqAIJGetArray() and friends through the device-aware versions */
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  /* inodes are a CPU-only optimization: enable them only when bound to CPU */
  if (flg && a->inode.size) {
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3587 
/* Convert a SeqAIJ matrix to MATSEQAIJCUSPARSE.  Depending on reuse this
   duplicates A, copies into *newmat, or (MAT_INPLACE_MATRIX) retypes A itself.
   It allocates the GPU-side context (Mat_SeqAIJCUSPARSE or, for factored
   matrices, Mat_SeqAIJCUSPARSETriFactors), installs the CUSPARSE method
   table via MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE), and composes the
   type-specific functions. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  B = *newmat;

  /* vectors created by MatCreateVecs() on B should live on the GPU */
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;
      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      spptr->format     = MAT_CUSPARSE_CSR;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
     #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
      spptr->spmvAlg    = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
     #else
      spptr->spmvAlg    = CUSPARSE_CSRMV_ALG1;    /* default, since we only support csr */
     #endif
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
     #endif
      B->spptr = spptr;
    } else {
      /* factored matrices carry triangular-factor data instead of the SpMV context */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* install the GPU method table and the composed type-specific functions */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
#if defined(PETSC_HAVE_HYPRE)
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr);
#endif
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetUseCPUSolve_C",MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3649 
/* Constructor for MATSEQAIJCUSPARSE: build a plain SeqAIJ matrix, then
   convert it in place to the CUSPARSE type. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3659 
3660 /*MC
3661    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3662 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
3664    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
3665    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3666 
3667    Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_use_cpu_solve - Do MatSolve on CPU
3672 
3673   Level: beginner
3674 
3675 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3676 M*/
3677 
3678 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);
3679 
/* Register the CUSPARSE solver packages with the MatSolverType registry:
   MATSOLVERCUSPARSEBAND provides a banded LU for plain SeqAIJ matrices,
   while MATSOLVERCUSPARSE provides LU/Cholesky/ILU/ICC factorizations for
   MATSEQAIJCUSPARSE matrices. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND,MATSEQAIJ,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}
3693 
/* Free a Mat_SeqAIJCUSPARSE context and everything it owns: the device CSR
   copies (matrix and cached transpose), work vectors, COO permutation arrays,
   and the cusparse handle. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  Mat_SeqAIJCUSPARSE *cusp = *cusparsestruct;
  cusparseStatus_t   stat;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (cusp) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat,cusp->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
    delete cusp->workVector;
    delete cusp->rowoffsets_gpu;
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    delete cusp->csr2csc_i;
    if (cusp->handle) {stat = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(stat);}
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr); /* also zeroes the caller's pointer */
  }
  PetscFunctionReturn(0);
}
3713 
/* Release the three device arrays of a CsrMatrix and the struct itself;
   the caller's pointer is reset to NULL so a second call is a no-op. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *m = *mat;

  PetscFunctionBegin;
  if (m) {
    delete m->values;
    delete m->column_indices;
    delete m->row_offsets;
    delete m;
    *mat = NULL;
  }
  PetscFunctionReturn(0);
}
3726 
/* Free one triangular-factor structure: the cusparse matrix descriptor and
   analysis info, the CSR copy of the factor, and the device/host work buffers.
   *trifactor is freed and zeroed by PetscFree. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    if ((*trifactor)->solveBuffer)   {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
    if ((*trifactor)->AA_h)   {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);} /* pinned host buffer */
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
   #endif
    ierr = PetscFree(*trifactor);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3746 
/* Destroy a Mat_SeqAIJCUSPARSEMultStruct: the stored matrix (CSR, or HYB/ELL
   for pre-CUDA-11 builds), the cusparse matrix descriptor, the compressed-row
   index vector, the device-resident scalar constants and (CUDA >= 11) the
   generic SpMV descriptors/buffers.  *matstruct is set to NULL on return. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
       #endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr); /* error code was previously ignored */
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    /* alpha_one/beta_zero/beta_one are scalars kept in device memory for SpMV calls */
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
3790 
/* Release everything owned by a Mat_SeqAIJCUSPARSETriFactors context while
   keeping the context itself (and its cusparse handle) alive so it can be
   reused for a new factorization.  All freed members are reset to NULL so
   calling Reset more than once is safe. */
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = NULL;
    (*trifactors)->cpermIndices = NULL;
    (*trifactors)->workVector = NULL;
    if ((*trifactors)->a_band_d)   {cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);}
    if ((*trifactors)->i_band_d)   {cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);}
    /* these were previously left dangling after cudaFree, so a second Reset
       (e.g. Reset followed by TriFactors_Destroy, which Resets again) would
       free already-freed device memory */
    (*trifactors)->a_band_d = NULL;
    (*trifactors)->i_band_d = NULL;
    (*trifactors)->init_dev_prop = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3813 
/* Destroy a triangular-factors context: reset its contents, destroy the
   cusparse handle it owns, then free the struct itself. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  cusparseHandle_t handle;
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr   = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    handle = (*trifactors)->handle;
    if (handle) { stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3830 
/* Lexicographic less-than on (row,col) index tuples: orders first by row,
   breaking ties by column.  Used to sort COO entries into CSR order. */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() < t2.get<0>() || (t1.get<0>() == t2.get<0>() && t1.get<1>() < t2.get<1>());
  }
};
3841 
/* Equality of (row,col) index tuples: true iff both components match.
   Used to collapse duplicate COO entries. */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>();
  }
};
3851 
/* Maps a pair of indices to 1 when they differ and 0 when equal; used with
   adjacent_difference to flag positions where a new index value starts. */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != t2) ? 1 : 0;
  }
};
3860 
/* Logical OR of two flags, returned as PetscInt (0 or 1); combines the
   row-change and column-change markers produced by IJDiff. */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != 0) || (t2 != 0);
  }
};
3869 
3870 #include <thrust/iterator/discard_iterator.h>
/* Set (INSERT_VALUES) or accumulate (ADD_VALUES) the values v[] into the GPU
   CSR storage of A.  v[] is ordered as the COO (i,j) input given to
   MatSetPreallocationCOO_SeqAIJCUSPARSE(); cusp->cooPerm maps CSR slots back
   to positions in v[], and cusp->cooPerm_a (when non-NULL) identifies runs of
   duplicate (i,j) entries that must be summed.  v may live on host or device;
   a NULL v with INSERT_VALUES zeroes the matrix. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY                           *cooPerm_v = NULL; /* device staging copy of v when v is a host pointer */
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                             *matrix;
  PetscErrorCode                        ierr;
  PetscInt                              n;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    /* no COO preallocation recorded: just (re)assemble and bail out */
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) {
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else {
    /* host values: stage them on the device first */
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v,v+n);
    d_v = cooPerm_v->data();
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
  }
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to add these them */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
        cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonozeros in d_v[].
        cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
      */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* all nonzeros in d_v[] are unique entries */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]]  */
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else {
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals()); /* values[i] = d_v[cooPerm[i]] */
    }
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU; /* only the device copy was updated */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr);
  ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr);
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
3952 
/* Mark the cached device transpose of A as out of date; with destroy ==
   PETSC_TRUE the transpose structure and the csr2csc index map are freed
   outright instead of being kept for reuse. */
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp) {
    if (destroy) {
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
      delete cusp->csr2csc_i;
      cusp->csr2csc_i = NULL;
    }
    A->transupdated = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3969 
3970 #include <thrust/binary_search.h>
/* Preallocate A from n COO entries (coo_i[], coo_j[]).  The (i,j) pairs are
   sorted on the device into CSR order while recording the permutation in
   cusp->cooPerm; duplicates are detected and, when present, cusp->cooPerm_a
   maps each input entry to its unique-nonzero index (used later by
   MatSetValuesCOO to sum repeated entries).  The host CSR arrays a->i/a->j
   are then rebuilt from the device result.  Negative index handling is not
   visible here -- entries are assumed to be valid (i,j) within the matrix
   dimensions. */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0; /* nzr counts nonempty rows */
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    /* size changed: discard any previously recorded permutation */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);           /* device copies of the COO indices */
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);   /* per-row upper bounds, becomes a->i[1..n] */

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);

    /* Ex.
      n = 6
      coo_i = [3,3,1,4,1,4]
      coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    *cusp->cooPerm_a = d_i; /* copy the sorted array */
    THRUSTINTARRAY w = d_j;

    /*
      d_i     = [1,1,3,3,4,4]
      d_j     = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i     = [1,3,3,4,4,x]
                            ^ekey
      d_j     = [2,2,3,5,6,x]
                           ^nekye
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());                                              /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
      (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a =          [0,0,1,1,1,1]*/
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin()); /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* replace the host CSR structure of A with one matching the device result */
    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
  } else {
    /* empty COO input: fall back to an empty standard preallocation */
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);

  A->assembled = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
4088 
4089 /*@C
4090     MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices.
4091 
4092    Not collective
4093 
4094     Input Parameters:
4095 +   A - the matrix
4096 -   compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form
4097 
4098     Output Parameters:
+   i - the CSR row pointers
-   j - the CSR column indices
4101 
4102     Level: developer
4103 
4104     Notes:
4105       When compressed is true, the CSR structure does not contain empty rows
4106 
4107 .seealso: MatSeqAIJCUSPARSERestoreIJ(), MatSeqAIJCUSPARSEGetArrayRead()
4108 @*/
/* Return device pointers to the CSR row offsets (*i) and column indices (*j)
   of A.  When compressed == PETSC_FALSE and the matrix stores compressed rows,
   a full-length row-offset array is lazily built on the device from the host
   a->i and cached in cusp->rowoffsets_gpu. */
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  /* NOTE(review): this returns early unless BOTH i and j are requested, yet the
     per-pointer guards below suggest requesting only one was intended -- confirm */
  if (!i || !j) PetscFunctionReturn(0);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); /* may trigger a host-to-device copy */
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      if (!cusp->rowoffsets_gpu) {
        cusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
        ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(0);
}
4137 
4138 /*@C
4139     MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ()
4140 
4141    Not collective
4142 
4143     Input Parameters:
4144 +   A - the matrix
-   compressed - PETSC_TRUE or PETSC_FALSE indicating whether the matrix data structure should always be returned in compressed form
4146 
4147     Output Parameters:
+   i - the CSR row pointers
-   j - the CSR column indices
4150 
4151     Level: developer
4152 
4153 .seealso: MatSeqAIJCUSPARSEGetIJ()
4154 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* simply invalidate the caller's pointers; the device arrays stay cached on the matrix */
  if (i) { *i = NULL; }
  if (j) { *j = NULL; }
  PetscFunctionReturn(0);
}
4164 
4165 /*@C
4166    MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4167 
4168    Not Collective
4169 
4170    Input Parameter:
4171 .   A - a MATSEQAIJCUSPARSE matrix
4172 
4173    Output Parameter:
4174 .   a - pointer to the device data
4175 
4176    Level: developer
4177 
4178    Notes: may trigger host-device copies if up-to-date matrix data is on host
4179 
4180 .seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArrayRead()
4181 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csrmat;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* read access: bring the values up to date on the device first */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)cusp->mat->mat;
  if (!csrmat->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  /* hand back the raw device pointer of the CSR value array */
  *a = csrmat->values->data().get();
  PetscFunctionReturn(0);
}
4200 
4201 /*@C
4202    MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead()
4203 
4204    Not Collective
4205 
4206    Input Parameter:
4207 .   A - a MATSEQAIJCUSPARSE matrix
4208 
4209    Output Parameter:
4210 .   a - pointer to the device data
4211 
4212    Level: developer
4213 
4214 .seealso: MatSeqAIJCUSPARSEGetArrayRead()
4215 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* read-only access modified nothing, so only the caller's pointer is cleared */
  *a = NULL;
  PetscFunctionReturn(0);
}
4225 
4226 /*@C
4227    MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4228 
4229    Not Collective
4230 
4231    Input Parameter:
4232 .   A - a MATSEQAIJCUSPARSE matrix
4233 
4234    Output Parameter:
4235 .   a - pointer to the device data
4236 
4237    Level: developer
4238 
4239    Notes: may trigger host-device copies if up-to-date matrix data is on host
4240 
4241 .seealso: MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArray()
4242 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csrmat;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* read-write access: first make the device copy current */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)cusp->mat->mat;
  if (!csrmat->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csrmat->values->data().get();
  /* the caller may write through the pointer: the GPU copy becomes the valid one
     and any cached transpose values are no longer trustworthy */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4263 /*@C
4264    MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray()
4265 
4266    Not Collective
4267 
4268    Input Parameter:
4269 .   A - a MATSEQAIJCUSPARSE matrix
4270 
4271    Output Parameter:
4272 .   a - pointer to the device data
4273 
4274    Level: developer
4275 
4276 .seealso: MatSeqAIJCUSPARSEGetArray()
4277 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* values may have been changed through the pointer: bump the object state so
     dependents (norms, caches, ...) know the matrix content is different */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a   = NULL;
  PetscFunctionReturn(0);
}
4290 
4291 /*@C
4292    MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4293 
4294    Not Collective
4295 
4296    Input Parameter:
4297 .   A - a MATSEQAIJCUSPARSE matrix
4298 
4299    Output Parameter:
4300 .   a - pointer to the device data
4301 
4302    Level: developer
4303 
4304    Notes: does not trigger host-device copies and flags data validity on the GPU
4305 
4306 .seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSERestoreArrayWrite()
4307 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csrmat;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* write-only access: unlike MatSeqAIJCUSPARSEGetArray(), no host-to-device copy
     is performed, so the device structures must already exist */
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)cusp->mat->mat;
  if (!csrmat->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csrmat->values->data().get();
  /* device copy is about to be overwritten: mark it valid and drop transpose values */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4327 
4328 /*@C
4329    MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite()
4330 
4331    Not Collective
4332 
4333    Input Parameter:
4334 .   A - a MATSEQAIJCUSPARSE matrix
4335 
4336    Output Parameter:
4337 .   a - pointer to the device data
4338 
4339    Level: developer
4340 
4341 .seealso: MatSeqAIJCUSPARSEGetArrayWrite()
4342 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* the device values were (presumably) written: advance the object state
     before invalidating the caller's pointer */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a   = NULL;
  PetscFunctionReturn(0);
}
4355 
/* Strict weak ordering on (row,col) pairs carried in a 4-tuple (row, col, value, tag);
   the value and tag components do not participate in the comparison. */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    const int r1 = t1.get<0>(), r2 = t2.get<0>();
    if (r1 != r2) return r1 < r2;       /* primary key: row index */
    return t1.get<1>() < t2.get<1>();   /* tie-break: column index */
  }
};
4366 
/* Unary functor adding a fixed offset to an integer index (used to shift
   column indices / row offsets when concatenating CSR structures). */
struct Shift
{
  int off; /* constant offset added to every input */

  Shift(int shift) : off(shift) {}
  __host__ __device__
  inline int operator() (const int &c) { return c + off; }
};
4378 
/* Merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows: row i of the result
   holds the entries of row i of A followed by those of row i of B, with B's column indices
   shifted by A->cmap->n ([A';B']' operation in matlab notation). All structural work is done
   on the GPU. With MAT_REUSE_MATRIX only the numerical values are refreshed, reusing the
   permutation (cooPerm) computed by a previous MAT_INITIAL_MATRIX call. */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     Annz,Bnnz;
  cusparseStatus_t             stat;
  PetscInt                     i,m,n,zero = 0;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidHeaderSpecific(B,MAT_CLASSID,2);
  PetscValidPointer(C,4);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
  /* A and B must have the same number of rows to be concatenated row-wise */
  if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n);
  if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
  if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    /* Build C from scratch: create the Mat object and hand-assemble its
       Mat_SeqAIJCUSPARSEMultStruct / CsrMatrix device structures */
    m     = A->rmap->n;
    n     = A->cmap->n + B->cmap->n;
    ierr  = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
    ierr  = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
    ierr  = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    c     = (Mat_SeqAIJ*)(*C)->data;
    Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
    Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr  = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows    = m;
    Ccusp->mat      = Cmat;
    Ccusp->mat->mat = Ccsr;
    Ccsr->num_rows  = m;
    Ccsr->num_cols  = n;
    /* standard cuSPARSE descriptor plus the 1/0 scalar constants kept on the device */
    stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
    if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
    if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr = (CsrMatrix*)Acusp->mat->mat;
    Bcsr = (CsrMatrix*)Bcusp->mat->mat;
    Annz = (PetscInt)Acsr->column_indices->size();
    Bnnz = (PetscInt)Bcsr->column_indices->size();
    c->nz = Annz + Bnnz;
    Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    Ccsr->num_entries = c->nz;
    /* cooPerm records, for each entry of C, where its value must come from on reuse */
    Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      /* Strategy: expand A and B to COO format, merge the two sorted (row,col) streams
         into C, then compress back to CSR. A per-entry tag (1 for A, 0 for B) carried
         through the merge yields the permutation needed to refill values later. */
      auto Acoo = new THRUSTINTARRAY32(Annz);
      auto Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff,*Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
          ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
          ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      /* expand CSR row offsets into per-entry row indices (COO) */
      stat = cusparseXcsr2coo(Acusp->handle,
                              Aroff->data().get(),
                              Annz,
                              m,
                              Acoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle,
                              Broff->data().get(),
                              Bnnz,
                              m,
                              Bcoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
      /* shift B's column indices on the fly while merging */
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
#endif
      auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
      auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
      auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
      auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
      auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
      auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
      auto p1 = Ccusp->cooPerm->begin();
      auto p2 = Ccusp->cooPerm->begin();
      thrust::advance(p2,Annz);
      /* merge the two (row,col,value,tag) streams ordered by (row,col) */
      PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
      /* undo the in-place shift applied to B's column indices above */
      thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
#endif
      /* split the merged positions by tag: indices tagged 1 (from A) go to the first
         Annz slots of cooPerm, indices tagged 0 (from B) to the remaining Bnnz slots */
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      auto pred = thrust::identity<int>();
      PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
      PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
#endif
      /* compress the merged COO row indices back into CSR row offsets for C */
      stat = cusparseXcoo2csr(Ccusp->handle,
                              Ccoo->data().get(),
                              c->nz,
                              m,
                              Ccsr->row_offsets->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
                               Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
        ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr);
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix *CcsrT = new CsrMatrix;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated = PETSC_TRUE;
        Ccusp->rowoffsets_gpu = NULL;
        CmatT->cprowIndices = NULL;
        CmatT->mat = CcsrT;
        CcsrT->num_rows = n;
        CcsrT->num_cols = m;
        CcsrT->num_entries = c->nz;

        CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values = new THRUSTARRAY(c->nz);

        /* C^T is [A^T; B^T] stacked vertically: copy A^T's CSR, then append B^T's
           with its row offsets shifted by a->nz (A's entry count) */
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
          thrust::advance(rT,-1); /* overwrite A^T's trailing offset with B^T's first */
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
          thrust::copy(titb,tite,rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

        stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
                                 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
                                 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* mirror the CSR structure on the host so C behaves like a regular SeqAIJ matrix */
    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii   = *Ccsr->row_offsets;
      jj   = *Ccsr->column_indices;
      cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    } else {
      cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
    c->maxnz = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax = 0;
    /* fill per-row lengths and running statistics from the row offsets */
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i+1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax,nn);
    }
    ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
    /* host value array is allocated but NOT filled: the valid values live on the GPU
       (offloadmask is set to PETSC_OFFLOAD_GPU below) */
    ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
    (*C)->nonzerostate++;
    ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
    ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* MAT_REUSE_MATRIX: structure is fixed, only scatter the current values of A and B
       into C using the permutation saved in cooPerm */
    if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",(*C)->rmap->n,B->rmap->n);
    c = (Mat_SeqAIJ*)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
      if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
      if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
      if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
      if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix*)Acusp->mat->mat;
      Bcsr = (CsrMatrix*)Bcusp->mat->mat;
      Ccsr = (CsrMatrix*)Ccusp->mat->mat;
      /* consistency checks between the cached structure and the current inputs */
      if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
      if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
      if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
      if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
      if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid,Acsr->num_entries);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      /* scatter A's values into C through cooPerm[0..Annz) ... */
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      thrust::for_each(zibait,zieait,VecCUDAEquals());
      /* ... and B's values through cooPerm[Annz..nz) */
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
      thrust::for_each(zibbit,ziebit,VecCUDAEquals());
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr);
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        /* refresh the transpose values too: C^T values are A^T's followed by B^T's */
        if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    }
  }
  ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
4682 
/* Gathers selected entries of the device CSR value array of A into v:
   v[k] = a[idx[k]] for k = 0,...,n-1, or the first n values when idx is NULL.
   v may point to host or device memory; the destination space is detected with
   isCudaMem() and the copy direction (and transfer logging) adapts accordingly. */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  PetscErrorCode    ierr;
  bool              dmem;              /* true when v lives in device memory */
  const PetscScalar *av;
  cudaError_t       cerr;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr); /* may trigger host-to-device copy of A */
  if (n && idx) {
    /* upload the index set and gather on the device with thrust */
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

    THRUSTARRAY *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w  = new THRUSTARRAY(n); /* device staging buffer for a host destination */
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (w) { /* host destination: bring the staged result down from the device */
      cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    delete w;
  } else {
    /* no index set: copy the leading n values directly, direction depending on where v lives */
    cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  /* a host destination received data FROM the device, so log a GPU-to-CPU transfer
     (was incorrectly logged as CpuToGpu before) */
  if (!dmem) { ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr); }
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4722