xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 05d37114d32dbc0e1c6335eb07be012574d7ef8c)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
7 
8 #include <petscconf.h>
9 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
10 #include <../src/mat/impls/sbaij/seq/sbaij.h>
11 #include <../src/vec/vec/impls/dvecimpl.h>
12 #include <petsc/private/vecimpl.h>
13 #undef VecType
14 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
15 #include <thrust/async/for_each.h>
16 
17 const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
18 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
19   /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
20     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
21 
22   typedef enum {
23       CUSPARSE_MV_ALG_DEFAULT = 0,
24       CUSPARSE_COOMV_ALG      = 1,
25       CUSPARSE_CSRMV_ALG1     = 2,
26       CUSPARSE_CSRMV_ALG2     = 3
27   } cusparseSpMVAlg_t;
28 
29   typedef enum {
30       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
31       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
32       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
33       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
34       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
35       CUSPARSE_SPMM_ALG_DEFAULT = 0,
36       CUSPARSE_SPMM_COO_ALG1    = 1,
37       CUSPARSE_SPMM_COO_ALG2    = 2,
38       CUSPARSE_SPMM_COO_ALG3    = 3,
39       CUSPARSE_SPMM_COO_ALG4    = 5,
40       CUSPARSE_SPMM_CSR_ALG1    = 4,
41       CUSPARSE_SPMM_CSR_ALG2    = 6,
42   } cusparseSpMMAlg_t;
43 
44   typedef enum {
45       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
46       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
47   } cusparseCsr2CscAlg_t;
48   */
49   const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
50   const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
51   const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
52 #endif
53 
54 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
55 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
56 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
57 
58 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
59 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
60 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
61 
62 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
63 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
64 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
65 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
66 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
67 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
68 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
69 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
70 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
71 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
72 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
73 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
74 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
75 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
76 
77 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
78 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
79 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
80 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
81 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
82 
83 PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
84 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
85 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);
86 
87 PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
88 PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
89 
90 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
91 
/*
   MatCUSPARSESetStream - Attaches a user-provided CUDA stream to the matrix's
   cuSPARSE handle, so subsequent cuSPARSE calls made through this matrix are
   queued on that stream.  Errors with PETSC_ERR_COR if the matrix has no spptr.
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *spcusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cstat;

  PetscFunctionBegin;
  if (!spcusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  /* remember the stream and push it down into the cuSPARSE handle */
  spcusp->stream = stream;
  cstat = cusparseSetStream(spcusp->handle,spcusp->stream);CHKERRCUSPARSE(cstat);
  PetscFunctionReturn(0);
}
103 
/*
   MatCUSPARSESetHandle - Makes the matrix use the given cuSPARSE handle.  Any
   different handle the matrix previously held is destroyed first, and the new
   handle's pointer mode is set to device pointers.
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  Mat_SeqAIJCUSPARSE *spcusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cstat;

  PetscFunctionBegin;
  if (!spcusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (spcusp->handle != handle) {
    /* release the old handle before adopting the new one */
    if (spcusp->handle) {
      cstat = cusparseDestroy(spcusp->handle);CHKERRCUSPARSE(cstat);
    }
    spcusp->handle = handle;
  }
  /* scalar arguments (alpha/beta) are expected to live in device memory */
  cstat = cusparseSetPointerMode(spcusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(cstat);
  PetscFunctionReturn(0);
}
120 
/*
   MatCUSPARSEClearHandle - Drops the matrix's reference to its cuSPARSE handle
   without calling cusparseDestroy() on it (presumably the handle is owned
   elsewhere, e.g. set via MatCUSPARSESetHandle() -- confirm with callers).
   No-op when A is not SEQAIJCUSPARSE or has no spptr.
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *spcusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;
  PetscBool          isseqaijcusp;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&isseqaijcusp);CHKERRQ(ierr);
  if (!isseqaijcusp || !spcusp) PetscFunctionReturn(0);
  if (spcusp->handle) spcusp->handle = 0;
  PetscFunctionReturn(0);
}
133 
/* Reports the solver package ("cusparse") that produced this factored matrix; composed
   onto factor matrices in MatGetFactor_seqaijcusparse_cusparse() below. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
140 
141 /*MC
142   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
143   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
145   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
146   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
147   algorithms are not recommended. This class does NOT support direct solver operations.
148 
149   Level: beginner
150 
151 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
152 M*/
153 
/*
   MatGetFactor_seqaijcusparse_cusparse - Creates the (empty) factor matrix B for a
   SEQAIJCUSPARSE matrix A and wires up the symbolic factorization routines for the
   requested factor type; the numeric routines are installed later by the symbolic
   phase.  Supports LU/ILU/ILUDT and Cholesky/ICC only.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;  /* factors are square, sized by A's row map */

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  /* propagate a CPU binding from A to the factor when A requests propagation */
  if (A->boundtocpu && A->bindingpropagates) { ierr = MatBindToCPU(*B,PETSC_TRUE);CHKERRQ(ierr); }
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    /* use the CUSPARSE symbolic variants only when A is not bound to the CPU */
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    /* full factorizations prefer the ND ordering, incomplete ones the natural ordering */
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);CHKERRQ(ierr);
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]);CHKERRQ(ierr);
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* factors fill in their own nonzero structure during the symbolic phase */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  (*B)->canuseordering = PETSC_TRUE;
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
195 
/*
   MatCUSPARSESetFormat_SeqAIJCUSPARSE - Type-specific implementation behind
   MatCUSPARSESetFormat().  A sequential matrix keeps a single storage format,
   so MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL set the same field.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *spcusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT: /* fall through: both operations share the one format */
  case MAT_CUSPARSE_ALL:
    spcusp->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
213 
214 /*@
215    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
216    operation. Only the MatMult operation can use different GPU storage formats
217    for MPIAIJCUSPARSE matrices.
218    Not Collective
219 
220    Input Parameters:
221 +  A - Matrix of type SEQAIJCUSPARSE
222 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
223 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
224 
225    Output Parameter:
226 
227    Level: intermediate
228 
229 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
230 @*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation if the matrix type provides one */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
240 
/*
   MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE - Type-specific implementation behind
   MatCUSPARSESetUseCPUSolve(); records whether MatSolve should run on the CPU.

   Fix: guard against a missing spptr before dereferencing it, consistent with
   MatCUSPARSESetStream()/MatCUSPARSESetHandle() above, so a corrupted matrix
   produces a PETSc error instead of a segfault.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A,PetscBool use_cpu)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusparsestruct->use_cpu_solve = use_cpu;
  PetscFunctionReturn(0);
}
249 
250 /*@
251    MatCUSPARSESetUseCPUSolve - Sets use CPU MatSolve.
252 
253    Input Parameters:
254 +  A - Matrix of type SEQAIJCUSPARSE
255 -  use_cpu - set flag for using the built-in CPU MatSolve
256 
257    Output Parameter:
258 
259    Notes:
260    The cuSparse LU solver currently computes the factors with the built-in CPU method
261    and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
   This method is used to specify whether the solve is done on the CPU or the GPU (GPU is the default).
263 
264    Level: intermediate
265 
266 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
267 @*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A,PetscBool use_cpu)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation if the matrix type provides one */
  ierr = PetscTryMethod(A,"MatCUSPARSESetUseCPUSolve_C",(Mat,PetscBool),(A,use_cpu));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
277 
/*
   MatSetOption_SeqAIJCUSPARSE - Handles MAT_FORM_EXPLICIT_TRANSPOSE specially
   (the cached GPU transpose must be invalidated when the option is turned off);
   all other options are delegated to the host AIJ implementation.
*/
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);}
    A->form_explicit_transpose = flg;
  } else {
    ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
295 
296 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
297 
/*
   MatLUFactorNumeric_SeqAIJCUSPARSE - Numeric LU factorization: the factorization
   itself runs on the CPU (MatLUFactorNumeric_SeqAIJ); this wrapper keeps the host
   copy of A current, installs the appropriate GPU/CPU MatSolve variants based on
   the row/column orderings, and (unless CPU solve was requested) uploads the
   triangular factors to the GPU.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)B->spptr;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* the CPU kernel reads A's host arrays, so sync them from the GPU first */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU; /* the fresh factors exist only on the host */
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    /* natural ordering: solves can skip the row/column permutations */
    if (!cusparsestruct->use_cpu_solve) {
      B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    }
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    if (!cusparsestruct->use_cpu_solve) {
      B->ops->solve = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  if (!cusparsestruct->use_cpu_solve) {
    ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
335 
/*
   MatSetFromOptions_SeqAIJCUSPARSE - Processes the -mat_cusparse_* runtime options:
   storage format for SpMV / all operations, CPU-vs-GPU triangular solve, and
   (CUDA >= 11 only) the cuSPARSE SpMV/SpMM/csr2csc algorithm selections.
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  /* these options only apply to non-factored matrices */
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
    ierr = PetscOptionsBool("-mat_cusparse_use_cpu_solve","Use CPU (I)LU solve","MatCUSPARSESetUseCPUSolve",cusparsestruct->use_cpu_solve,&cusparsestruct->use_cpu_solve,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetUseCPUSolve(A,cusparsestruct->use_cpu_solve);CHKERRQ(ierr);}
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
    if (flg && CUSPARSE_SPMV_CSR_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#else
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#endif
    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
   #endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
376 
/* Symbolic ILU: discard any previously built GPU triangular-factor data, run the
   host AIJ symbolic phase, and route the numeric phase to the CUSPARSE version. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&trifactors);CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
388 
/* Symbolic LU: discard any previously built GPU triangular-factor data, run the
   host AIJ symbolic phase, and route the numeric phase to the CUSPARSE version. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&trifactors);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
400 
/* Symbolic ICC: discard any previously built GPU triangular-factor data, run the
   host AIJ symbolic phase, and route the numeric phase to the CUSPARSE version. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&trifactors);CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
412 
/* Symbolic Cholesky: discard any previously built GPU triangular-factor data, run
   the host AIJ symbolic phase, and route the numeric phase to the CUSPARSE version. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&trifactors);CHKERRQ(ierr);
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
424 
/*
   MatSeqAIJCUSPARSEBuildILULowerTriMatrix - Builds (or refreshes) the GPU CSR copy
   of the unit lower triangular ILU factor stored on the host in A.

   Each device row holds the strictly lower triangular entries of the host row
   followed by an explicit 1.0 unit-diagonal entry, hence nzLower = n + ai[n] - ai[1].
   On first use the full structure is created and the cusparse triangular-solve
   analysis is run; on subsequent calls only the numerical values are re-uploaded.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* only (re)build when the up-to-date factor data lives on the CPU */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {
        PetscScalar                       *AALo;

        /* pinned (page-locked) host staging buffers for the uploads below */
        cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the lower triangular matrix */
        cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the lower triangular matrix */
        AiLo[0]  = (PetscInt) 0;
        AiLo[n]  = nzLower;
        AjLo[0]  = (PetscInt) 0;
        AALo[0]  = (MatScalar) 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        rowOffset= 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          /* copy row i's strictly-lower entries, then append the unit diagonal */
          ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix; indices are uploaded into 32-bit arrays (THRUSTINTARRAY32) */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* newer API needs an explicit work buffer for the triangular-solve analysis */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(),
                               #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 loTriFactor->solveInfo,
                                 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                               #else
                                 loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
                               #endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        /* keep the pinned values buffer for later value-only refreshes; the index
           buffers are no longer needed once uploaded */
        loTriFactor->AA_h = AALo;
        cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0]  = 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
          offset      += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset      += 1;
          v  += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
571 
572 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
573 {
574   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
575   PetscInt                          n = A->rmap->n;
576   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
577   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
578   cusparseStatus_t                  stat;
579   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
580   const MatScalar                   *aa = a->a,*v;
581   PetscInt                          *AiUp, *AjUp;
582   PetscInt                          i,nz, nzUpper, offset;
583   PetscErrorCode                    ierr;
584   cudaError_t                       cerr;
585 
586   PetscFunctionBegin;
587   if (!n) PetscFunctionReturn(0);
588   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
589     try {
590       /* next, figure out the number of nonzeros in the upper triangular matrix. */
591       nzUpper = adiag[0]-adiag[n];
592       if (!upTriFactor) {
593         PetscScalar *AAUp;
594 
595         cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
596 
597         /* Allocate Space for the upper triangular matrix */
598         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
599         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
600 
601         /* Fill the upper triangular matrix */
602         AiUp[0]=(PetscInt) 0;
603         AiUp[n]=nzUpper;
604         offset = nzUpper;
605         for (i=n-1; i>=0; i--) {
606           v  = aa + adiag[i+1] + 1;
607           vi = aj + adiag[i+1] + 1;
608 
609           /* number of elements NOT on the diagonal */
610           nz = adiag[i] - adiag[i+1]-1;
611 
612           /* decrement the offset */
613           offset -= (nz+1);
614 
615           /* first, set the diagonal elements */
616           AjUp[offset] = (PetscInt) i;
617           AAUp[offset] = (MatScalar)1./v[nz];
618           AiUp[i]      = AiUp[i+1] - (nz+1);
619 
620           ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
621           ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
622         }
623 
624         /* allocate space for the triangular factor information */
625         ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
626         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
627 
628         /* Create the matrix description */
629         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
630         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
631        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
632         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
633        #else
634         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
635        #endif
636         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
637         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
638 
639         /* set the operation */
640         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
641 
642         /* set the matrix */
643         upTriFactor->csrMat = new CsrMatrix;
644         upTriFactor->csrMat->num_rows = n;
645         upTriFactor->csrMat->num_cols = n;
646         upTriFactor->csrMat->num_entries = nzUpper;
647 
648         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
649         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
650 
651         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
652         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
653 
654         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
655         upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
656 
657         /* Create the solve analysis information */
658         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
659         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
660       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
661         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
662                                      upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
663                                      upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
664                                      upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
665                                      &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
666         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
667       #endif
668 
669         /* perform the solve analysis */
670         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
671                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
672                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
673                                  upTriFactor->csrMat->column_indices->data().get(),
674                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
675                                  upTriFactor->solveInfo,
676                                  upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
677                                #else
678                                  upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
679                                #endif
680         cerr = WaitForCUDA();CHKERRCUDA(cerr);
681         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
682 
683         /* assign the pointer */
684         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
685         upTriFactor->AA_h = AAUp;
686         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
687         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
688         ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
689       } else {
690         if (!upTriFactor->AA_h) {
691           cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
692         }
693         /* Fill the upper triangular matrix */
694         offset = nzUpper;
695         for (i=n-1; i>=0; i--) {
696           v  = aa + adiag[i+1] + 1;
697 
698           /* number of elements NOT on the diagonal */
699           nz = adiag[i] - adiag[i+1]-1;
700 
701           /* decrement the offset */
702           offset -= (nz+1);
703 
704           /* first, set the diagonal elements */
705           upTriFactor->AA_h[offset] = 1./v[nz];
706           ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
707         }
708         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
709         ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
710       }
711     } catch(char *ex) {
712       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
713     }
714   }
715   PetscFunctionReturn(0);
716 }
717 
/*
  Push both ILU triangular factors to the GPU and, when the row/column
  orderings are not the identity, cache the permutation indices on the
  device so the solve kernels can apply them there.  Permutation arrays
  are created at most once (guarded by the rperm/cperm pointers).
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *fact = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           rowis = aij->row;
  IS                           colis = aij->icol;
  PetscInt                     nrows = A->rmap->n;
  PetscBool                    rowIdentity,colIdentity;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (!fact) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");

  /* build/refresh the lower and upper factors on the device */
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  if (!fact->workVector) fact->workVector = new THRUSTARRAY(nrows);
  fact->nnz = aij->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* row permutation indices (lower-triangular side) */
  ierr = ISIdentity(rowis,&rowIdentity);CHKERRQ(ierr);
  if (!rowIdentity && !fact->rpermIndices) {
    const PetscInt *ridx;

    ierr = ISGetIndices(rowis,&ridx);CHKERRQ(ierr);
    fact->rpermIndices = new THRUSTINTARRAY(nrows);
    fact->rpermIndices->assign(ridx,ridx+nrows);
    ierr = ISRestoreIndices(rowis,&ridx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(nrows*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* column permutation indices (upper-triangular side) */
  ierr = ISIdentity(colis,&colIdentity);CHKERRQ(ierr);
  if (!colIdentity && !fact->cpermIndices) {
    const PetscInt *cidx;

    ierr = ISGetIndices(colis,&cidx);CHKERRQ(ierr);
    fact->cpermIndices = new THRUSTINTARRAY(nrows);
    fact->cpermIndices->assign(cidx,cidx+nrows);
    ierr = ISRestoreIndices(colis,&cidx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(nrows*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
761 
/*
  Build the ICC (Cholesky) triangular factor structures on the GPU.

  The host factor is read through SBAIJ-style arrays (b->i/b->j/b->a); in
  each row the off-diagonal entries come first and the diagonal entry is
  last (v[nz] below, with nz = row length - 1).  Two device CSR matrices
  sharing one sparsity pattern (AiUp/AjUp, the upper triangle) are built:
    - upTriFactor: values AAUp, solved with CUSPARSE_OPERATION_NON_TRANSPOSE
    - loTriFactor: values AALo, solved with CUSPARSE_OPERATION_TRANSPOSE,
      which is why the "lower" factor also carries CUSPARSE_FILL_MODE_UPPER
  On the first call (neither factor exists yet) the index arrays and the
  cuSPARSE solve analyses are created; subsequent calls only recompute the
  numerical values and copy them to the device.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers for the two value arrays */
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (!upTriFactor && !loTriFactor) {
        /* first call: build structure + values, then the solve analyses */
        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz]; /* diagonal is stored last in the host row */
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            /* off-diagonals are negated; the lower-factor values are
               additionally scaled by the row's diagonal entry */
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(),
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 upTriFactor->solveInfo,
                                 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                                #else
                                  upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
                                #endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        /* NOTE: the lower factor reuses the UPPER-triangular pattern and is
           solved with the TRANSPOSE operation below, so FILL_MODE_UPPER is
           intentional here */
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(),
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 loTriFactor->solveInfo,
                                 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                                #else
                                 loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
                                #endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      } else {
        /* structure already on the GPU: recompute values only */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        /* both factors were created together above; having only one is corruption */
        if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
981 
/*
  Build the ICC triangular factors on the GPU and, for a non-identity
  ordering, cache the permutation (and its inverse) on the device.

  Fix: the permutation index arrays are now created only once.  This
  routine is re-entered on every numeric refactorization (the Cholesky
  numeric routine resets the offload mask and calls it again); the old
  code unconditionally executed "new THRUSTINTARRAY(n)" each time,
  leaking the previously allocated arrays.  The guard mirrors the one
  already used by MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  /* both triangles are represented but the diagonal is stored once */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices: rperm/cperm are created together, so testing
     rpermIndices suffices to detect a previous visit */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity && !cusparseTriFactors->rpermIndices) {
    IS             iip;
    const PetscInt *irip,*rip;

    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1019 
/*
  Numeric Cholesky factorization for SEQAIJCUSPARSE: the factorization
  itself runs on the host (MatCholeskyFactorNumeric_SeqAIJ); afterwards the
  solve kernels are selected according to the ordering and the triangular
  factors are pushed to the GPU for the solve analysis.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *bdata = (Mat_SeqAIJ*)B->data;
  IS             perm   = bdata->row;
  PetscBool      natural;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* the host factorization needs current values of A */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* determine which version of MatSolve needs to be used */
  ierr = ISIdentity(perm,&natural);CHKERRQ(ierr);
  B->ops->solve             = natural ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering
                                      : MatSolve_SeqAIJCUSPARSE;
  B->ops->solvetranspose    = natural ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering
                                      : MatSolveTranspose_SeqAIJCUSPARSE;
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1049 
1050 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
1051 {
1052   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1053   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1054   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1055   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
1056   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1057   cusparseStatus_t                  stat;
1058   cusparseIndexBase_t               indexBase;
1059   cusparseMatrixType_t              matrixType;
1060   cusparseFillMode_t                fillMode;
1061   cusparseDiagType_t                diagType;
1062   cudaError_t                       cerr;
1063   PetscErrorCode                    ierr;
1064 
1065   PetscFunctionBegin;
1066   /* allocate space for the transpose of the lower triangular factor */
1067   ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
1068   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1069 
1070   /* set the matrix descriptors of the lower triangular factor */
1071   matrixType = cusparseGetMatType(loTriFactor->descr);
1072   indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
1073   fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1074     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1075   diagType = cusparseGetMatDiagType(loTriFactor->descr);
1076 
1077   /* Create the matrix description */
1078   stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
1079   stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
1080   stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
1081   stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
1082   stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
1083 
1084   /* set the operation */
1085   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1086 
1087   /* allocate GPU space for the CSC of the lower triangular factor*/
1088   loTriFactorT->csrMat = new CsrMatrix;
1089   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
1090   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
1091   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
1092   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
1093   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1094   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
1095 
1096   /* compute the transpose of the lower triangular factor, i.e. the CSC */
1097 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1098   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1099                                        loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1100                                        loTriFactor->csrMat->values->data().get(),
1101                                        loTriFactor->csrMat->row_offsets->data().get(),
1102                                        loTriFactor->csrMat->column_indices->data().get(),
1103                                        loTriFactorT->csrMat->values->data().get(),
1104                                        loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1105                                        CUSPARSE_ACTION_NUMERIC,indexBase,
1106                                        CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1107   cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1108 #endif
1109 
1110   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1111   stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1112                           loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1113                           loTriFactor->csrMat->values->data().get(),
1114                           loTriFactor->csrMat->row_offsets->data().get(),
1115                           loTriFactor->csrMat->column_indices->data().get(),
1116                           loTriFactorT->csrMat->values->data().get(),
1117                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1118                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1119                           CUSPARSE_ACTION_NUMERIC, indexBase,
1120                           CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat);
1121                         #else
1122                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1123                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1124                         #endif
1125   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1126   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1127 
1128   /* Create the solve analysis information */
1129   ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1130   stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1131 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1132   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
1133                                 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1134                                 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1135                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
1136                                 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1137   cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1138 #endif
1139 
1140   /* perform the solve analysis */
1141   stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
1142                            loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1143                            loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1144                            loTriFactorT->csrMat->column_indices->data().get(),
1145                           #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1146                            loTriFactorT->solveInfo,
1147                            loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
1148                           #else
1149                            loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1150                           #endif
1151   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1152   ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1153 
1154   /* assign the pointer */
1155   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1156 
1157   /*********************************************/
1158   /* Now the Transpose of the Upper Tri Factor */
1159   /*********************************************/
1160 
1161   /* allocate space for the transpose of the upper triangular factor */
1162   ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
1163   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1164 
1165   /* set the matrix descriptors of the upper triangular factor */
1166   matrixType = cusparseGetMatType(upTriFactor->descr);
1167   indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
1168   fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1169     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1170   diagType = cusparseGetMatDiagType(upTriFactor->descr);
1171 
1172   /* Create the matrix description */
1173   stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
1174   stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
1175   stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
1176   stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
1177   stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
1178 
1179   /* set the operation */
1180   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1181 
1182   /* allocate GPU space for the CSC of the upper triangular factor*/
1183   upTriFactorT->csrMat = new CsrMatrix;
1184   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1185   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1186   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1187   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
1188   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1189   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1190 
1191   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1192 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1193   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
1194                                 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1195                                 upTriFactor->csrMat->values->data().get(),
1196                                 upTriFactor->csrMat->row_offsets->data().get(),
1197                                 upTriFactor->csrMat->column_indices->data().get(),
1198                                 upTriFactorT->csrMat->values->data().get(),
1199                                 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1200                                 CUSPARSE_ACTION_NUMERIC,indexBase,
1201                                 CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1202   cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1203 #endif
1204 
1205   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1206   stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
1207                           upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1208                           upTriFactor->csrMat->values->data().get(),
1209                           upTriFactor->csrMat->row_offsets->data().get(),
1210                           upTriFactor->csrMat->column_indices->data().get(),
1211                           upTriFactorT->csrMat->values->data().get(),
1212                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1213                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1214                           CUSPARSE_ACTION_NUMERIC, indexBase,
1215                           CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat);
1216                         #else
1217                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1218                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1219                         #endif
1220 
1221   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1222   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1223 
1224   /* Create the solve analysis information */
1225   ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1226   stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1227   #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1228   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
1229                                  upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1230                                  upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1231                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
1232                                  &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1233   cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1234   #endif
1235 
1236   /* perform the solve analysis */
1237   stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
1238                            upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1239                            upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1240                            upTriFactorT->csrMat->column_indices->data().get(),
1241                           #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1242                            upTriFactorT->solveInfo,
1243                            upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
1244                           #else
1245                            upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1246                           #endif
1247 
1248   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1249   ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1250 
1251   /* assign the pointer */
1252   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1253   PetscFunctionReturn(0);
1254 }
1255 
/* Unary functor that converts a PetscScalar to a PetscInt by truncating its
   real part. Used with thrust::transform to turn the scalar-encoded
   permutation produced by csr2csc into an integer index array. */
struct PetscScalarToPetscInt
{
  __host__ __device__
  PetscInt operator()(PetscScalar s)
  {
    return static_cast<PetscInt>(PetscRealPart(s));
  }
};
1264 
/*
   MatSeqAIJCUSPARSEFormExplicitTranspose - builds (or refreshes) an explicit
   device-side representation of A^T for use by transpose multiplies.

   The transpose struct is created lazily: the first call allocates the CSR of
   A^T and (for the CSR format) caches an integer permutation csr2csc_i mapping
   A's value array onto A^T's; subsequent calls only re-permute the (possibly
   updated) values through that cached permutation, avoiding a second csr2csc.
   Sets A->transupdated on success and returns immediately when it is already set.

   Fix vs. previous revision: in the legacy (pre-CUDA-11) ELL/HYB branch the
   transpose tempT was sized with A's un-swapped dimensions, overrunning
   tempT->row_offsets (the CSC column pointer, which needs A->cmap->n+1
   entries) for rectangular matrices; csr2hyb also received un-swapped dims.
*/
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(0); /* transpose already current */
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
    /* non-CSR formats cannot be updated in place; discard and rebuild */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
  }
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

    /* set alpha and beta (device-resident scalars used by SpMV-style calls) */
    err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      CsrMatrix *matrixT = new CsrMatrix;
      matstructT->mat = matrixT;
      matrixT->num_rows = A->cmap->n;  /* transpose swaps the dimensions */
      matrixT->num_cols = A->rmap->n;
      matrixT->num_entries = a->nz;
      matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values = new THRUSTARRAY(a->nz);

      if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      #if PETSC_PKG_CUDA_VERSION_GE(11,2,1)
        stat = cusparseCreateCsr(&matstructT->matDescr,
                               matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                               matrixT->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
      #else
        /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
           see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

           I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
           it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
           when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
        */
        if (matrixT->num_entries) {
          stat = cusparseCreateCsr(&matstructT->matDescr,
                                 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                                 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                                 matrixT->values->data().get(),
                                 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I,
                                 indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);

        } else {
          matstructT->matDescr = NULL;
          matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
        }
      #endif
     #endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
   #else
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows = A->rmap->n;
      temp->num_cols = A->cmap->n;
      temp->num_entries = a->nz;
      temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle,
                              matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get());CHKERRCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose). tempT stores the
         CSR of A^T, so its dimensions are A's swapped; in particular
         tempT->row_offsets receives the CSC column pointer from csr2csc and
         must hold A->cmap->n+1 entries (sizing it with A->rmap->n+1 overran
         the buffer for rectangular matrices with more columns than rows) */
      tempT->num_rows = A->cmap->n;
      tempT->num_cols = A->rmap->n;
      tempT->num_entries = a->nz;
      tempT->row_offsets = new THRUSTINTARRAY32(A->cmap->n+1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                              temp->num_cols, temp->num_entries,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get(),
                              tempT->values->data().get(),
                              tempT->column_indices->data().get(),
                              tempT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

      /* Last, convert CSC to HYB; pass tempT's (swapped) dimensions */
      cusparseHybMat_t hybMat;
      stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
      cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
        CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat = cusparse_csr2hyb(cusparsestruct->handle, A->cmap->n, A->rmap->n,
                              matstructT->descr, tempT->values->data().get(),
                              tempT->row_offsets->data().get(),
                              tempT->column_indices->data().get(),
                              hybMat, 0, partition);CHKERRCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY*) tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
        delete (CsrMatrix*) tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY*) temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
        delete (CsrMatrix*) temp;
      }
     #endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
    if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix");
    if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows");
    if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols");
    if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values");
    if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT");
    if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows");
    if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols");
    if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
      ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    if (!cusparsestruct->csr2csc_i) {
      /* Build the value permutation once: run csr2csc on the identity
         permutation encoded as scalars, then read the permuted values back
         as integers (PetscScalarToPetscInt) */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      void   *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                           A->cmap->n, matrix->num_entries,
                                           matrix->values->data().get(),
                                           cusparsestruct->rowoffsets_gpu->data().get(),
                                           matrix->column_indices->data().get(),
                                           matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                           CUSPARSE_ACTION_NUMERIC,indexBase,
                                           cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat);
      err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err);
     #endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                              A->cmap->n,matrix->num_entries,
                              csr2csc_a.data().get(),
                              cusparsestruct->rowoffsets_gpu->data().get(),
                              matrix->column_indices->data().get(),
                              matrixT->values->data().get(),
                             #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                              matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                              CUSPARSE_ACTION_NUMERIC,indexBase,
                              cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat);
                             #else
                              matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
                             #endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
      }

      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      err = cudaFree(csr2cscBuffer);CHKERRCUDA(err);
     #endif
    }
    /* Fast path: gather A's (possibly updated) values through the cached permutation */
    PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
                                                     thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
                                                     matrixT->values->begin()));
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
1494 
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/* MatSolveTranspose_SeqAIJCUSPARSE - solve A^T x = b with the cached triangular
   factors, for factorizations computed with a non-natural (permuted) ordering.

   The explicitly transposed factors (loTriFactorPtrTranspose/upTriFactorPtrTranspose)
   are built lazily on first use by MatSeqAIJCUSPARSEAnalyzeTransposeForSolve and
   reused afterwards; the solves below therefore run with op = NON_TRANSPOSE on the
   already-transposed data. Sequence: permute b by the row permutation, solve with
   the transposed upper factor, then the transposed lower factor, then gather the
   result through the column permutation (via the work vector, since the gather
   cannot be done in place). */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly (first call only) */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder b with the row permutation, storing the result in x */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve with the transposed upper factor: input x, output the work vector */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Then, solve with the transposed lower factor: input the work vector, output x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                         xarray);CHKERRCUSPARSE(stat);
                      #endif

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1583 
/* MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering - solve A^T x = b with the
   cached triangular factors when the factorization used the natural ordering,
   so no row/column permutations are needed (cf. MatSolveTranspose_SeqAIJCUSPARSE).

   The explicitly transposed factors are built lazily on first call; the solves
   run with op = NON_TRANSPOSE on the already-transposed data: solve with the
   transposed upper factor (b -> work vector), then the transposed lower factor
   (work vector -> x). */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly (first call only) */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve with the transposed upper factor: input b, output the work vector */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Then, solve with the transposed lower factor: input the work vector, output x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        xarray);CHKERRCUSPARSE(stat);
                      #endif

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1653 
/* MatSolve_SeqAIJCUSPARSE - solve A x = b with the cached triangular factors,
   for factorizations computed with a non-natural (permuted) ordering.

   Sequence: gather b through the row permutation into the work vector, forward
   solve with L (work vector -> x), back solve with U (x -> work vector), then
   gather the work vector through the column permutation into x. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder b with the row permutation, storing the result in the work vector */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L: input the work vector, output x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                         xarray,
                         loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                         xarray);CHKERRCUSPARSE(stat);
                      #endif

  /* Then, solve U: input x, output the work vector */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,xarray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Last, reorder the work vector with the column permutation into x */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1729 
/*
   MatSolve_SeqAIJCUSPARSE_NaturalOrdering - GPU triangular solve A*x = b using the
   cached cuSPARSE lower/upper triangular factors, for a factorization computed in
   natural ordering: no row/column permutations are applied, so bb feeds the L-solve
   directly and xx receives the U-solve result directly.

   Performs L*t = b into the work vector tempGPU, then U*x = t into xarray.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L: the extra nnz, policy and buffer arguments exist only in the
     CUDA >= 9 flavor of cusparse_solve */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        loTriFactor->solvePolicy,loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Next, solve U: input is the work vector produced by the L-solve, output goes
     straight into the caller's solution array */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        xarray);CHKERRCUSPARSE(stat);
                      #endif

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* flop estimate for the two triangular solves: ~2*nnz - n */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1791 
/*
   MatSeqAIJCUSPARSECopyFromGPU - Download the matrix values from the device CSR
   copy into the host Mat_SeqAIJ value array when the GPU holds the only
   up-to-date copy, then mark both copies valid.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cudaError_t        cerr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  /* nothing to do unless the device copy is the authoritative one */
  if (A->offloadmask != PETSC_OFFLOAD_GPU) PetscFunctionReturn(0);

  CsrMatrix *csr = (CsrMatrix*)cusp->mat->mat;

  ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
  cerr = cudaMemcpy(a->a, csr->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_BOTH;
  PetscFunctionReturn(0);
}
1812 
/* Give the caller read/write access to the host value array, first syncing it
   from the GPU if the device copy is newer. */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ     *aij;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* make sure the CPU values are current before handing them out */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  aij  = (Mat_SeqAIJ*)A->data;
  *array = aij->a;
  PetscFunctionReturn(0);
}
1822 
/* Companion to MatSeqAIJGetArray_SeqAIJCUSPARSE: the caller may have modified the
   host values, so the CPU copy becomes the authoritative one. */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  *array         = NULL;              /* invalidate the caller's pointer */
  A->offloadmask = PETSC_OFFLOAD_CPU; /* host values may have changed */
  PetscFunctionReturn(0);
}
1830 
/* Expose a read-only view of the host value array, first syncing it from the GPU
   if the device copy is newer. */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  Mat_SeqAIJ     *aij;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* pull the values off the device when the GPU copy is the authoritative one */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  aij  = (Mat_SeqAIJ*)A->data;
  array[0] = aij->a;
  PetscFunctionReturn(0);
}
1840 
/* Companion to MatSeqAIJGetArrayRead_SeqAIJCUSPARSE: the values were only read,
   so the offload state is left untouched. */
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  PetscFunctionBegin;
  array[0] = NULL; /* invalidate the caller's pointer */
  PetscFunctionReturn(0);
}
1847 
/* Hand out the host value array for write-only access; no download from the GPU
   is needed since the current values will be overwritten. */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  *array = aij->a;
  PetscFunctionReturn(0);
}
1854 
/* Companion to MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE: the host values were
   (re)written, so the CPU copy becomes the authoritative one. */
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  *array         = NULL;              /* invalidate the caller's pointer */
  A->offloadmask = PETSC_OFFLOAD_CPU; /* host holds the newest values */
  PetscFunctionReturn(0);
}
1862 
/*
   MatSeqAIJCUSPARSECopyToGPU - Ensure the cuSPARSE (device) copy of the matrix
   matches the host Mat_SeqAIJ data.

   Two paths:
   - if the nonzero pattern is unchanged (same nonzerostate) and the format is CSR,
     only the numerical values are re-uploaded and the cached transpose is marked
     value-stale;
   - otherwise all device structures are destroyed and rebuilt from scratch: CSR
     always, converted further to ELL/HYB when that format was requested (only
     supported for CUDA < 11), plus the compressed-row index array when in use.
*/
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* host-to-device upload of the values only; the pattern is already on the GPU */
      matrix->values->assign(a->a, a->a+a->nz);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* the cached transpose (if any) now has stale values but a valid pattern */
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
    } else {
      PetscInt nnz;
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* the nonzero pattern changed (or the format is not CSR): rebuild everything */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        /* compressed-row storage covers only the nonempty rows; ridx maps them back */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
        if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");
        /* no host values yet: upload only the pattern and do NOT mark the host copy valid */
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
        else nnz = a->nz;

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident scalar constants (1 and 0), required since the handle is
           put in CUSPARSE_POINTER_MODE_DEVICE below */
        err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                    mat->num_rows, mat->num_cols, mat->num_entries,
                                    mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                    mat->values->data().get(),
                                    CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                    CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
          }
         #endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
         #else
          /* stage a temporary CSR copy on the device, convert it to HYB/ELL, then
             free the temporary */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          /* ELL is HYB with the partition forced to the max-width regular part */
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
              matstruct->descr, mat->values->data().get(),
              mat->row_offsets->data().get(),
              mat->column_indices->data().get(),
              hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
         #endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        /* account for row offsets + column indices (int), row indices (PetscInt),
           and values plus the three scalar constants */
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    /* both is false only when no host values existed, in which case the host copy
       is not (yet) valid */
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
2019 
/* Thrust functor: accumulate element 0 of the tuple into element 1
   (destination += source), typically applied through thrust::for_each
   over a pair of zipped iterators. */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    /* slot 1 is the destination, slot 0 the addend */
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};
2029 
/* Thrust functor: copy element 0 of the tuple into element 1
   (destination = source). */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
2039 
/* Thrust functor: copy element 1 of the tuple into element 0 -- the mirror of
   VecCUDAEquals, for when the zipped iterators are in the opposite order. */
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};
2049 
/* Per-product state stashed in C->product->data for AIJCUSPARSE mat-mat products;
   freed by MatDestroy_MatMatCusparse. */
struct MatMatCusparse {
  PetscBool             cisdense; /* the user's C was MATSEQDENSE (CPU): convert back after computing on the GPU */
  PetscScalar           *Bt;      /* CUDA < 11 only: device buffer for B^T, since cusparseXcsrmm cannot transpose B */
  Mat                   X;        /* intermediate dense result used by the PtAP/RARt paths */
  PetscBool             reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops;
  CsrMatrix             *Bcsr;

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr;
  PetscBool             initialized;   /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr;     /* dense descriptor for B */
  cusparseDnMatDescr_t  matCDescr;     /* dense descriptor for C (or for X in PtAP/RARt) */
  PetscInt              Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  void                  *dBuffer4;
  void                  *dBuffer5;
 #endif
  size_t                mmBufferSize; /* current size of mmBuffer, to avoid needless reallocation */
  void                  *mmBuffer;
  void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};
2074 
/*
   MatDestroy_MatMatCusparse - Destroy callback for the MatMatCusparse product data:
   frees the device buffers and cuSPARSE descriptors (cudaFree/destroy calls are
   guarded or rely on cudaFree(NULL) being a no-op), destroys the intermediate
   dense matrix X, and frees the struct itself.
*/
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  PetscErrorCode   ierr;
  MatMatCusparse   *mmdata = (MatMatCusparse *)data;
  cudaError_t      cerr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t stat;
 #endif

  PetscFunctionBegin;
  cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr);
  delete mmdata->Bcsr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->matBDescr)   { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->matCDescr)   { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->spgemmDesc)  { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); }
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  if (mmdata->dBuffer4)  { cerr = cudaFree(mmdata->dBuffer4);CHKERRCUDA(cerr); }
  if (mmdata->dBuffer5)  { cerr = cudaFree(mmdata->dBuffer5);CHKERRCUDA(cerr); }
 #endif
  if (mmdata->mmBuffer)  { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); }
  if (mmdata->mmBuffer2) { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); }
 #endif
  ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2103 
2104 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
2105 
/*
   MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA - Numeric phase of products of a
   SeqAIJCUSPARSE matrix A with a dense matrix B, producing the dense matrix C.

   Supported product types and what is computed on the GPU:
     AB   : C = A*B          AtB : C = A^T*B (explicit transpose of A if formed)
     ABt  : C = A*B^T        PtAP: X = A*B, then C = B^T*X (dense-dense)
     RARt : X = A*B^T, then C = B*X (dense-dense)

   CUDA >= 11 uses the generic cusparseSpMM API (caching descriptors and the work
   buffer in the MatMatCusparse product data); older CUDA uses csrmm, explicitly
   transposing B with cublasXgeam first when needed.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr   = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp   = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* pick the sparse operand (A or its explicit transpose), the cuSPARSE op for A,
     and the dimensions m x n of the sparse-dense GPU product */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      /* an explicitly stored A^T is multiplied without transposing */
      ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);

  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* PtAP/RARt write the sparse-dense product into the intermediate X, not into C */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
    /* grow the work buffer only when the required size exceeds what we have */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      cudaError_t cerr;
      cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
      cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
 #else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    /* form B^T explicitly into mmdata->Bt via an out-of-place geam transpose */
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k    = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
 #endif
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  /* PtAP/RARt: finish with a dense-dense product of B (transposed for PtAP) and X */
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  /* convert results back to the CPU types the user originally passed in */
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2278 
/*
   MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA - Symbolic phase for products of a
   SeqAIJCUSPARSE matrix A with a dense matrix B producing a dense C.

   Sets the sizes and (CUDA dense) type of C and allocates the MatMatCusparse
   product data:
   - for CUDA < 11 and ABt/RARt, a device buffer to hold B^T (csrmm cannot
     transpose B);
   - for PtAP/RARt, the intermediate dense matrix X that will hold A*B or A*B^T.

   The numeric phase is MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  /* result dimensions for each supported product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense;
 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
 #endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2352 
/*
   Numeric phase of the sparse-sparse products C = A*B, C = A^T*B or C = A*B^T,
   with A, B and C all MATSEQAIJCUSPARSE.  The symbolic phase
   (MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE) has already allocated the
   CSR structure of C on the GPU and, for CUDA >= 11, set up the SpGEMM
   descriptor; here only the numerical values of C are (re)computed on the GPU.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t         BmatSpDescr;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;
  A = product->A;
  B = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix*)Cmat->mat;
    if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
    /* values already on the GPU from the symbolic phase: only finish the CPU-side assembly bookkeeping */
    goto finalize;
  }
  if (!c->nz) goto finalize; /* empty product: nothing to compute */
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  /* make sure the latest host values of A and B are on the GPU before multiplying */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);

  /* symmetric operands let us replace A^T (resp. B^T) by A (resp. B); the symbolic
     phase must have made the same substitution for the pattern to match */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    if (!product->symbolic_used_the_fact_A_is_symmetric) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    if (!product->symbolic_used_the_fact_B_is_symmetric) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric");
  }
  /* transposes are realized through the explicitly stored transpose structs,
     since cuSPARSE SpGEMM only supports non-transposed operands (see opA/opB) */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix*)Cmat->mat;
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  /* alpha_one/beta_zero live in device memory, hence device pointer mode */
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
    /* SpGEMMreuse path: the descriptor retains the structure from the symbolic
       phase, so a single compute call updates the values of C */
    stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                               Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                               cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                               mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
  #else
    /* SpGEMM path: compute into the work buffer sized by the symbolic phase,
       then copy the result into C's CSR arrays */
    stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                               Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                               cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                               mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
    stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                               Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                               cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
  #endif
#else
  /* legacy (CUDA < 11) csrgemm interface writes directly into C's CSR arrays */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  C->offloadmask = PETSC_OFFLOAD_GPU; /* values are now valid on the GPU only */
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
  ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
  c->reallocs         = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}
2482 
/*
   Symbolic phase of the sparse-sparse products C = A*B, C = A^T*B or C = A*B^T,
   with A, B and C all MATSEQAIJCUSPARSE.  Builds the sparsity pattern of C on
   the GPU with cuSPARSE, allocates C's device CSR arrays and the matching host
   (Mat_SeqAIJ) arrays, and prepares the data (spgemm descriptor, work buffers)
   reused by the numeric phase.  Three code paths are selected at compile time:
   cusparseSpGEMMreuse (CUDA >= 11.4), cusparseSpGEMM (CUDA >= 11.0) and the
   legacy csrgemm interface (CUDA < 11).  Note that with SpGEMM/SpGEMMreuse the
   values of C are computed here as well; when both operands are up to date on
   the GPU and the user called through the MatMatMult()-style API, the numeric
   phase is flagged (mmdata->reusesym) to skip recomputation.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *a,*b,*c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     i,j,m,n,k;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble               flops;
  PetscBool                    biscompressed,ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
  cusparseSpMatDescr_t         BmatSpDescr;
#else
  int                          cnz;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ*)A->data;
  b = (Mat_SeqAIJ*)B->data;
  /* product data, destroyed together with C->product by MatDestroy_MatMatCusparse */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");

  /* exploit symmetry of A (resp. B) to turn A^T*B (resp. A*B^T) into A*B; the
     numeric phase checks these flags and makes the same substitution */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
  }
  if (B->symmetric && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
  }
  /* select operand mult structs, dimensions of C, and whether B (operand) or C
     (result, inherited from the A operand's row pattern) use compressed rows */
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  switch (ptype) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    k = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr);
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  ierr  = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  ierr  = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  c     = (Mat_SeqAIJ*)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  Cmat  = new Mat_SeqAIJCUSPARSEMultStruct; /* ownership passes to Ccusp->mat below */
  Ccsr  = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
    c->compressedrow.nrows = a->compressedrow.nrows;
    ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
    ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m; /* number of rows actually stored on the GPU */
  Ccusp->mat      = Cmat;
  Ccusp->mat->mat = Ccsr;
  Ccsr->num_rows    = Ccusp->nrows;
  Ccsr->num_cols    = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
  stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
  /* device-resident scalar constants used with CUSPARSE_POINTER_MODE_DEVICE */
  cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
    thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
    c->nz = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix*)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix */
    /* build a shallow CsrMatrix view of B with uncompressed row offsets; it
       shares column_indices/values with B's stored matrix and is kept in
       mmdata->Bcsr for the numeric phase */
    CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
    Bcsr = new CsrMatrix;
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices;
    Bcsr->values         = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
      ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr = Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
                               Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                               Bcsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  /* precompute flops count, logged by the numeric phase */
  if (ptype == MATPRODUCT_AB) {
    /* each entry a(i,brow) multiplies the whole row brow of B: 2*nnz(B(brow,:)) flops */
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i+1];
      for (j=st; j<en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2.*(b->i[brow+1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    /* row i of A contributes its nnz count times nnz of row i of B */
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt anzi = a->i[i+1] - a->i[i];
      const PetscInt bnzi = b->i[i+1] - b->i[i];
      flops += (2.*anzi)*bnzi;
    }
  } else { /* TODO */
    flops = 0.;
  }

  mmdata->flops = flops;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  /* C's descriptor starts empty (nnz 0, NULL arrays); cuSPARSE fills in the size
     and we attach the arrays with cusparseCsrSetPointers once nnz is known */
  stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
                          NULL, NULL, NULL,
                          CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                          CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
 {
  /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
     We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
  */
  void*  dBuffer1 = NULL;
  void*  dBuffer2 = NULL;
  void*  dBuffer3 = NULL;
  /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
  size_t bufferSize1 = 0;
  size_t bufferSize2 = 0;
  size_t bufferSize3 = 0;
  size_t bufferSize4 = 0;
  size_t bufferSize5 = 0;

  /*----------------------------------------------------------------------*/
  /* ask bufferSize1 bytes for external memory */
  stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                            CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                            &bufferSize1, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &dBuffer1, bufferSize1);CHKERRCUDA(cerr);
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                            CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                            &bufferSize1, dBuffer1);CHKERRCUSPARSE(stat);

  /*----------------------------------------------------------------------*/
  /* compute the structure (nnz) of C; each call follows the usual cuSPARSE
     two-step pattern: query buffer sizes with NULL, allocate, then call again */
  stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                 &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &dBuffer2, bufferSize2);CHKERRCUDA(cerr);
  cerr = cudaMalloc((void**) &dBuffer3, bufferSize3);CHKERRCUDA(cerr);
  cerr = cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4);CHKERRCUDA(cerr);
  stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                 &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);CHKERRCUSPARSE(stat);
  cerr = cudaFree(dBuffer1);CHKERRCUDA(cerr);
  cerr = cudaFree(dBuffer2);CHKERRCUDA(cerr);

  /*----------------------------------------------------------------------*/
  /* get matrix C non-zero entries C_nnz1 */
  stat  = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
  c->nz = (PetscInt) C_nnz1;
  /* allocate matrix C */
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values         = new THRUSTARRAY(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  /* update matC with the new pointers */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());CHKERRCUSPARSE(stat);

  /*----------------------------------------------------------------------*/
  /* copy the structure into C; dBuffer5 must outlive this call (reused by
     cusparseSpGEMMreuse_compute), hence stored in mmdata */
  stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                  &bufferSize5, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5);CHKERRCUDA(cerr);
  stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                  &bufferSize5, mmdata->dBuffer5);CHKERRCUSPARSE(stat);
  cerr = cudaFree(dBuffer3);CHKERRCUDA(cerr);
  /* first numeric computation of C's values, done here so the numeric phase can
     be skipped when mmdata->reusesym is set below */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                                     Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                     cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                     mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
  ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024);CHKERRQ(ierr);
 }
 #else // ~PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  size_t bufSize2;
  /* ask bufferSize bytes for external memory */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
  /* ask bufferSize again bytes for external memory */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
  /* The CUSPARSE documentation is not clear, nor the API
     We need both buffers to perform the operations properly!
     mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
     it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
     is stored in the descriptor! What a messy API... */
  cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
  /* compute the intermediate product of A * B */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  /* get matrix C non-zero entries C_nnz1 */
  stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
  c->nz = (PetscInt) C_nnz1;
  ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr);
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
 #endif
#else
  /* legacy (CUDA < 11) interface: NNZ count is returned on the host */
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
  stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
  c->nz = cnz;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
     I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
     D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalizesym:
  /* mirror the GPU structure into the host Mat_SeqAIJ arrays so CPU code paths
     (MatGetRow, viewers, ...) see a consistent matrix */
  c->singlemalloc = PETSC_FALSE;
  c->free_a       = PETSC_TRUE;
  c->free_ij      = PETSC_TRUE;
  ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
  if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
    PetscInt *d_i = c->i;
    THRUSTINTARRAY ii(Ccsr->row_offsets->size());
    THRUSTINTARRAY jj(Ccsr->column_indices->size());
    ii   = *Ccsr->row_offsets;
    jj   = *Ccsr->column_indices;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  } else {
    PetscInt *d_i = c->i;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  if (ciscompressed) { /* need to expand host row offsets */
    /* rows not listed in compressedrow.rindex are empty: give them the offset of
       the previous stored row so c->i is a valid full-size CSR row pointer */
    PetscInt r = 0;
    c->i[0] = 0;
    for (k = 0; k < c->compressedrow.nrows; k++) {
      const PetscInt next = c->compressedrow.rindex[k];
      const PetscInt old = c->compressedrow.i[k];
      for (; r < next; r++) c->i[r+1] = old;
    }
    for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
  }
  ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
  c->maxnz = c->nz;
  c->nonzerorowcnt = 0;
  c->rmax = 0;
  /* per-row lengths and row statistics, as MatAssemblyEnd_SeqAIJ would compute */
  for (k = 0; k < m; k++) {
    const PetscInt nn = c->i[k+1] - c->i[k];
    c->ilen[k] = c->imax[k] = nn;
    c->nonzerorowcnt += (PetscInt)!!nn;
    c->rmax = PetscMax(c->rmax,nn);
  }
  ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); /* host values array; filled only if C is ever pulled to the CPU */
  Ccsr->num_entries = c->nz;

  C->nonzerostate++;
  ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
  Ccusp->nonzerostate = C->nonzerostate;
  C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
  C->preallocated  = PETSC_TRUE;
  C->assembled     = PETSC_FALSE;
  C->was_assembled = PETSC_FALSE;
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
2881 
2882 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2883 
/* Select the symbolic-product implementation for products involving a SeqAIJCUSPARSE A.
   Handles sparse (SeqAIJCUSPARSE) or dense (SeqDense) B. The GPU backend is chosen only
   when the operands are GPU-capable and not bound to the CPU; per-product command line
   options (e.g. -matmatmult_backend_cpu) let the user force the CPU code path. */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
  PetscErrorCode ierr;
  PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  /* B dense? (base type compare so derived dense types also match) */
  ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr);
  /* B counts as GPU-resident only if neither A nor B is bound to the CPU */
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr);
  }
  /* for triple products the middle/right operand C must also be GPU-resident */
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr);
    }
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool usecpu = PETSC_FALSE;
    /* each product type exposes two option names: the legacy API name (api_user) and the MatProduct name */
    switch (product->type) {
    case MATPRODUCT_AB:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_ab_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_AtB:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_atb_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_PtAP:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_ptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_RARt:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_rart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_ABC:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_abc_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    default:
      break;
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE; /* user requested the CPU backend: pretend B/C are not cusparse */
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
     if (product->A->boundtocpu) {
        ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      /* triple products are composed from pairwise GPU products by the basic driver */
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3006 
/* yy = A*xx: plain multiply (no add vector, no transpose, no Hermitian) */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3015 
/* zz = A*xx + yy: multiply-add (no transpose, no Hermitian) */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3024 
/* yy = A^H * xx: Hermitian-transpose multiply (trans=PETSC_TRUE, herm=PETSC_TRUE) */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3033 
/* zz = A^H * xx + yy: Hermitian-transpose multiply-add */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3042 
/* yy = A^T * xx: transpose multiply (trans=PETSC_TRUE, herm=PETSC_FALSE) */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3051 
/* Scatter-add kernel: y[idx[i]] += x[i] for i in [0,n).
   Used to accumulate the compressed-row work vector x into the full-length result y.
   Written as a grid-stride loop so the kernel is correct for ANY launch configuration,
   not only when the caller supplies at least ceil(n/blockDim) blocks; PetscInt indexing
   also avoids int overflow of blockIdx.x*blockDim.x for very large n.
   NOTE(review): no atomics are used, so idx is assumed to hold distinct indices — this
   holds for the compressed-row index list cprowIndices used by the caller. */
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  const PetscInt stride = (PetscInt)gridDim.x*blockDim.x;
  for (PetscInt i = (PetscInt)blockIdx.x*blockDim.x + threadIdx.x; i < n; i += stride) y[idx[i]] += x[i];
}
3057 
/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op

   Shared implementation behind MatMult/MatMultAdd/MatMultTranspose/MatMultHermitianTranspose(Add).
   yy may be NULL (no add) or alias zz. Handles compressed-row storage (zero rows dropped) by
   routing through cusparsestruct->workVector and scattering/gathering with cprowIndices. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny;
#endif

  PetscFunctionBegin;
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
  /* empty matrix: op(A) x is zero, so the result is just y (or zero) */
  if (!a->nonzerorowcnt) {
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    /* for ^H, or when no explicit transpose is kept, let cuSPARSE apply op(A) to the untransposed data;
       otherwise use the explicitly stored transpose with a non-transpose operation */
    if (herm || !A->form_explicit_transpose) {
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
     #endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
     #endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* opA indexes the per-operation cache cuSpMV[], so guard against enum value changes */
      if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        cudaError_t cerr;
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                matstruct->matDescr,
                                matstruct->cuSpMV[opA].vecXDescr, beta,
                                matstruct->cuSpMV[opA].vecYDescr,
                                cusparse_scalartype,
                                cusparsestruct->spmvAlg,
                                &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
      }

      stat = cusparseSpMV(cusparsestruct->handle, opA,
                               matstruct->alpha_one,
                               matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
                               matstruct->cuSpMV[opA].vecXDescr,
                               beta,
                               matstruct->cuSpMV[opA].vecYDescr,
                               cusparse_scalartype,
                               cusparsestruct->spmvAlg,
                               matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
     #else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha_one, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
     #endif
    } else {
      if (cusparsestruct->nrows) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha_one, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
       #endif
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
         */
       #if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
       #else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
       #endif
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      /* transpose path: SpMV already wrote the full-length result; add yy only when it was not folded in via beta */
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  /* flop counts: 2 flops per stored nonzero; without an add, one flop per nonzero row is saved */
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3249 
/* zz = A^T * xx + yy: transpose multiply-add */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3258 
/* Finish assembly via the SeqAIJ path, then drop any cached device matrix whose
   nonzero pattern has become stale. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscObjectState   state_before = A->nonzerostate;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  /* if assembly changed the nonzero pattern, the cached device matrix was built
     for the old pattern and must be released */
  if (cusp->deviceMat && state_before != A->nonzerostate) {
    cudaError_t cerr;

    ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr);
    cerr = cudaFree(cusp->deviceMat);CHKERRCUDA(cerr);
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}
3276 
3277 /* --------------------------------------------------------------------------------*/
3278 /*@
3279    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
3281    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
3282    assembly performance the user should preallocate the matrix storage by setting
3283    the parameter nz (or the array nnz).  By setting these parameters accurately,
3284    performance during matrix assembly can be increased by more than a factor of 50.
3285 
3286    Collective
3287 
3288    Input Parameters:
3289 +  comm - MPI communicator, set to PETSC_COMM_SELF
3290 .  m - number of rows
3291 .  n - number of columns
3292 .  nz - number of nonzeros per row (same for all rows)
3293 -  nnz - array containing the number of nonzeros in the various rows
3294          (possibly different for each row) or NULL
3295 
3296    Output Parameter:
3297 .  A - the matrix
3298 
3299    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
3301    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
3302 
3303    Notes:
3304    If nnz is given then nz is ignored
3305 
3306    The AIJ format (also called the Yale sparse matrix format or
3307    compressed row storage), is fully compatible with standard Fortran 77
3308    storage.  That is, the stored row and column indices can begin at
3309    either one (as in Fortran) or zero.  See the users' manual for details.
3310 
3311    Specify the preallocated storage with either nz or nnz (not both).
3312    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
3313    allocation.  For large problems you MUST preallocate memory or you
3314    will get TERRIBLE performance, see the users' manual chapter on matrices.
3315 
3316    By default, this format uses inodes (identical nodes) when possible, to
3317    improve numerical efficiency of matrix-vector products and solves. We
3318    search for consecutive rows with the same nonzero structure, thereby
3319    reusing matrix information to achieve increased efficiency.
3320 
3321    Level: intermediate
3322 
3323 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
3324 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* standard create/set-sizes/set-type sequence; for a sequential matrix the local and
     global sizes coincide, hence (m,n,m,n) */
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* preallocate directly through the SeqAIJ implementation (type is already set) */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3336 
/* Destroy the GPU-side data (plain or factored), detach all composed methods, then
   fall through to the SeqAIJ destroy for the host-side data. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* spptr holds different structures for plain vs factored matrices */
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  /* remove every composed function installed at creation/convert time */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetUseCPUSolve_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaijcusparse_hypre_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3360 
3361 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
3362 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicate by copying on the host as SeqAIJ, then converting the copy in place
   back to SeqAIJCUSPARSE. */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3372 
/* Y += a*X on the GPU. Uses a single cuBLAS axpy over the CSR value arrays when the
   nonzero patterns match, cusparse csrgeam when X's pattern is a subset of Y's, and
   the host SeqAIJ implementation otherwise. */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry,*csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
  /* mixed implementations (e.g. one matrix bound to CPU): fall back to the host path */
  if (X->ops->axpy != Y->ops->axpy) {
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
  if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;
  /* see if we can turn this into a cublas axpy */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    /* compare row offsets first (cheaper), then column indices, both on the device */
    bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
    if (eq) {
      eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
    }
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    /* csrgeam computes Y = a*X + 1.0*Y in place (output uses Y's descriptor/pattern) */
    cusparseStatus_t stat;
    PetscScalar      b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t           bufferSize;
    void             *buffer;
    cudaError_t      cerr;
#endif

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    /* a and b live on the host here; restore device pointer mode afterwards */
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* CUDA >= 11 requires an explicit workspace query + allocation */
    stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
                                          &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                          &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                             cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    cerr = cudaFree(buffer);CHKERRCUDA(cerr);
#else
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
#endif
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical patterns: the value arrays line up entry-for-entry, so a flat axpy suffices */
    cublasHandle_t cublasv2handle;
    cublasStatus_t berr;
    PetscBLASInt   one = 1, bnz = 1;

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr);
    ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else {
    /* different patterns: do it on the host; Y's pattern (and hence its transpose) changes */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3470 
/* Y *= a: scale all stored nonzero values on the GPU with a single cuBLAS scal
   over the CSR value array. */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *y = (Mat_SeqAIJ*)Y->data;
  PetscScalar    *aval;
  cublasHandle_t handle;
  cublasStatus_t cberr;
  PetscBLASInt   inc = 1, n = 1;

  PetscFunctionBegin;
  ierr = PetscCUBLASGetHandle(&handle);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(y->nz,&n);CHKERRQ(ierr); /* nz may exceed PetscBLASInt range */
  ierr = MatSeqAIJCUSPARSEGetArray(Y,&aval);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  cberr = cublasXscal(handle,n,&a,aval,inc);CHKERRCUBLAS(cberr);
  ierr = PetscLogGpuFlops(n);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSERestoreArray(Y,&aval);CHKERRQ(ierr);
  /* values changed, so any cached diagonal/idiagonal is stale */
  ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3492 
/* Zero all stored values of A. Zeros the GPU copies (matrix and, if present, its cached
   transpose) with thrust::fill, then zeros the host array, so no later host-device copy
   is needed when both copies existed. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;
  PetscBool      both = PETSC_FALSE; /* set when the GPU copy was zeroed too */
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) { /* factored matrices keep their data in trifactor structs, not here */
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
    if (spptr->mat) {
      CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
      if (matrix->values) {
        both = PETSC_TRUE;
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
    if (spptr->matTranspose) { /* keep the cached transpose consistent */
      CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
      if (matrix->values) {
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
  }
  //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);
  ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr); /* zero the host values as well */
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; /* host and device copies now agree */
  else A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
3523 
/* Bind A's operations to the CPU (flg = PETSC_TRUE) or to the GPU/CUSPARSE (flg = PETSC_FALSE).
   Swaps the Mat and Mat_SeqAIJ operation tables and the composed type-specific functions
   accordingly. Factored matrices are untouched. When binding to CPU, data is first copied
   down from the GPU so host operations see up-to-date values. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  if (flg) {
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); /* host must have current values before CPU ops take over */

    /* restore the plain SeqAIJ (CPU) implementations */
    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    ierr = PetscMemzero(a->ops,sizeof(Mat_SeqAIJOps));CHKERRQ(ierr); /* clear GPU array accessors installed below */
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  } else {
    /* install the CUSPARSE (GPU) implementations */
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  if (flg && a->inode.size) { /* inode optimization only applies to the CPU kernels */
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3584 
/* Convert a SEQAIJ matrix to SEQAIJCUSPARSE. Depending on reuse this duplicates A,
   copies into an existing matrix, or converts in place; it then allocates the CUSPARSE
   side-structure (regular or triangular-factor variant), installs the GPU operation
   tables via MatBindToCPU_SeqAIJCUSPARSE(.,PETSC_FALSE), and composes type-specific
   functions. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  B = *newmat; /* MAT_INPLACE_MATRIX falls through: B aliases A */

  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr); /* vectors created from B default to CUDA */

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;
      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      spptr->format     = MAT_CUSPARSE_CSR;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
     #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
      spptr->spmvAlg    = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
     #else
      spptr->spmvAlg    = CUSPARSE_CSRMV_ALG1;    /* default, since we only support csr */
     #endif
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
     #endif
      B->spptr = spptr;
    } else {
      /* factored matrices carry a triangular-factor struct instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; /* no GPU copy of the values exists yet */
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr); /* install the GPU op tables */
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
#if defined(PETSC_HAVE_HYPRE)
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr);
#endif
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetUseCPUSolve_C",MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3646 
/* Constructor for MATSEQAIJCUSPARSE: build a plain SEQAIJ matrix, then convert it in place */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3656 
3657 /*MC
3658    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3659 
   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
3661    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
3662    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3663 
3664    Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_use_cpu_solve - Do MatSolve on CPU
3669 
3670   Level: beginner
3671 
3672 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3673 M*/
3674 
3675 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);
3676 
/* Register the CUSPARSE-based solver types with PETSc's solver registry:
   a banded LU for SEQAIJ, and LU/Cholesky/ILU/ICC factorizations for SEQAIJCUSPARSE. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}
3690 
/* Free all GPU-side data owned by a Mat_SeqAIJCUSPARSE struct (matrix, cached transpose,
   work vectors, COO permutation arrays, cusparse handle) and the struct itself.
   Safe to call with *cusparsestruct == NULL. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    /* deleting a NULL pointer is a no-op in C++, so no guards are needed here */
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    delete (*cusparsestruct)->csr2csc_i;
    if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3710 
/* Release the three device arrays of a CsrMatrix (values, column indices, row offsets)
   and the CsrMatrix object itself, then reset the caller's pointer to NULL.
   A NULL *mat is accepted and ignored. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    CsrMatrix *m = *mat;

    delete m->values;
    delete m->column_indices;
    delete m->row_offsets;
    delete m;
    *mat = 0;
  }
  PetscFunctionReturn(0);
}
3723 
/* Destroy one triangular-factor struct: its cusparse matrix descriptor and analysis info,
   the CSR storage, the solve buffer, the pinned host copy of the values (AA_h), and (for
   CUDA >= 11) the csr2csc conversion buffer. Safe to call with *trifactor == NULL. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    if ((*trifactor)->solveBuffer)   {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
    if ((*trifactor)->AA_h)   {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);} /* pinned host memory, hence cudaFreeHost */
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
   #endif
    ierr = PetscFree(*trifactor);CHKERRQ(ierr); /* PetscFree also NULLs *trifactor */
  }
  PetscFunctionReturn(0);
}
3743 
/* Destroy a mat-vec struct: the stored matrix (CSR, or HYB/ELL on CUDA < 11), the cusparse
   matrix descriptor, compressed-row indices, the device-resident scalar constants, and (for
   CUDA >= 11) the SpMV descriptors/buffers. Safe to call with *matstruct == NULL.

   Fix: the CsrMatrix_Destroy() return value was previously discarded; it is now checked
   with CHKERRQ like every other PETSc call in this file. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
       #endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    /* alpha_one/beta_zero/beta_one are device-resident scalars (cusparse device pointer mode) */
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    for (int i=0; i<3; i++) { /* one cached SpMV setup per operation: N, T, H */
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
3787 
/* Reset (but do not destroy) a triangular-factors struct: free all four factor structs,
   the permutation index arrays, the work vector, and the banded-factor device arrays,
   leaving the struct itself (and its cusparse handle) alive for reuse.

   Fix: a_band_d and i_band_d are now set to NULL after cudaFree, matching how
   rpermIndices/cpermIndices/workVector are handled above; previously a second Reset
   (or a Reset after a partial setup) would cudaFree stale pointers. */
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    /* MatSeqAIJCUSPARSEMultStruct_Destroy NULLs each pointer via PetscFree, so these are idempotent */
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = NULL;
    (*trifactors)->cpermIndices = NULL;
    (*trifactors)->workVector = NULL;
    if ((*trifactors)->a_band_d)   {cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);(*trifactors)->a_band_d = NULL;}
    if ((*trifactors)->i_band_d)   {cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);(*trifactors)->i_band_d = NULL;}
    (*trifactors)->init_dev_prop = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3810 
/* Fully destroy a triangular-factors struct: reset all contained factor data, destroy
   the cusparse handle, and free the struct. Safe to call with *trifactors == NULL.

   Fix: the assignment-inside-condition `if (handle = ...)` (a -Wparentheses trap that
   reads like a typo for ==) is hoisted into a separate statement; behavior is unchanged. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    handle = (*trifactors)->handle;
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3827 
/* Strict-weak ordering on (i,j) pairs: sort by row index first, then by column index.
   Used to order COO entries into CSR order. */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    const PetscInt r1 = t1.get<0>(), r2 = t2.get<0>();
    if (r1 != r2) return r1 < r2;         /* different rows: row order decides */
    return t1.get<1>() < t2.get<1>();     /* same row: column order decides */
  }
};
3838 
/* Equality of (i,j) pairs: true iff both the row and the column indices match.
   Used to collapse duplicate COO entries. */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>();
  }
};
3848 
/* Binary op for adjacent_difference: 1 where consecutive values differ, 0 where equal.
   Marks the start of each new index value in a sorted sequence. */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != t2) ? 1 : 0;
  }
};
3857 
/* Logical-or of two flags, returned as a PetscInt (0 or 1).
   Combines the row-change and column-change markers produced by IJDiff. */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 || t2) ? 1 : 0;
  }
};
3866 
3867 #include <thrust/iterator/discard_iterator.h>
/* Set (INSERT_VALUES) or accumulate (ADD_VALUES) the matrix values from a COO value
   array v[], using the permutation (cooPerm) and duplicate map (cooPerm_a) computed by
   MatSetPreallocationCOO_SeqAIJCUSPARSE(). v may be host or device memory; n must match
   the preallocation call. All work happens on the GPU and the matrix ends up assembled
   with the up-to-date copy on the device. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY                           *cooPerm_v = NULL; /* device copy of v[] when v is host memory */
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                             *matrix;
  PetscErrorCode                        ierr;
  PetscInt                              n;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) { /* no COO preallocation was done; just assemble and return */
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) { /* NULL v: INSERT means zero the matrix, ADD is a no-op */
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v); /* already on the device, use in place */
  } else {
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v,v+n); /* host-to-device copy of the values */
    d_v = cooPerm_v->data();
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
  }
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to add them up first */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
        cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonzeros in d_v[].
        cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
      */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>()); /* values += reduced contributions */
      delete cooPerm_w;
    } else {
      /* all nonzeros in d_v[] are unique entries */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]]  */
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else {
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals()); /* values[i] = d_v[cooPerm[i]] */
    }
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU; /* device copy is now the authoritative one */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr);
  ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr);
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
3949 
/* Mark the cached transpose of A as out of date. With destroy == PETSC_TRUE the cached
   transpose structure and the csr2csc index map are also freed (needed when the nonzero
   pattern changes, not just the values). */
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (!cusp) PetscFunctionReturn(0);
  if (destroy) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
    delete cusp->csr2csc_i;
    cusp->csr2csc_i = NULL;
  }
  A->transupdated = PETSC_FALSE; /* forces regeneration on next transpose use */
  PetscFunctionReturn(0);
}
3966 
3967 #include <thrust/binary_search.h>
/* Build the CSR structure of A from n COO index pairs (coo_i[], coo_j[]) on the GPU.
   Sorts the pairs into CSR order (recording the permutation in cusp->cooPerm), collapses
   duplicates (recording the duplicate map in cusp->cooPerm_a when any exist), constructs
   a->i/a->j on the host, and eagerly allocates the GPU mat-vec structures. Values are
   supplied later by MatSetValuesCOO_SeqAIJCUSPARSE(). */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0; /* nzr counts rows that have at least one nonzero */
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) { /* size changed: discard stale permutation data */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);            /* device copies of the COO row/col indices */
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);    /* per-row ends of the CSR row-offset array */

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);

    /* Ex.
      n = 6
      coo_i = [3,3,1,4,1,4]
      coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0); /* identity permutation, carried along by the sort */
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    *cusp->cooPerm_a = d_i; /* copy the sorted array */
    THRUSTINTARRAY w = d_j;

    /*
      d_i     = [1,1,3,3,4,4]
      d_j     = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i     = [1,3,3,4,4,x]
                            ^ekey
      d_j     = [2,2,3,5,6,x]
                           ^nekey
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());                                              /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
      (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a =          [0,0,1,1,1,1]*/
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin()); /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* rebuild the host CSR arrays a->i / a->j from the device results */
    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i]; /* nonzeros in row i */
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr); /* empty matrix */
  }
  ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);

  A->assembled = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
4085 
4086 /*@C
4087     MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices.
4088 
4089    Not collective
4090 
4091     Input Parameters:
4092 +   A - the matrix
4093 -   compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form
4094 
    Output Parameters:
+   i - the CSR row pointers
-   j - the CSR column indices
4098 
4099     Level: developer
4100 
4101     Notes:
4102       When compressed is true, the CSR structure does not contain empty rows
4103 
4104 .seealso: MatSeqAIJCUSPARSERestoreIJ(), MatSeqAIJCUSPARSEGetArrayRead()
4105 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  /* NOTE(review): this early return requires BOTH i and j to be non-NULL, so a caller
     asking for only one of them gets nothing; the later `if (i)`/`if (j)` guards are
     then redundant -- confirm whether requesting a single array should be supported */
  if (!i || !j) PetscFunctionReturn(0);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); /* make sure the device CSR is current */
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      if (!cusp->rowoffsets_gpu) { /* build and cache the uncompressed row offsets on the device */
        cusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
        ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(0);
}
4134 
/*@C
    MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ()

   Not collective

    Input Parameters:
+   A - the matrix
-   compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form

    Output Parameters:
+   i - the CSR row pointers
-   j - the CSR column indices

    Level: developer

.seealso: MatSeqAIJCUSPARSEGetIJ()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* nothing to copy back; just invalidate the caller's handles */
  if (i) {
    *i = NULL;
  }
  if (j) {
    *j = NULL;
  }
  PetscFunctionReturn(0);
}
4161 
/*@C
   MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes: may trigger host-device copies if up-to-date matrix data is on host

.seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArrayRead()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csrmat;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* values are read-only: sync host -> device if needed, but do not flag device data as modified */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusparsestruct->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)cusparsestruct->mat->mat;
  if (!csrmat->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csrmat->values->data().get();
  PetscFunctionReturn(0);
}
4197 
/*@C
   MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead()

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

.seealso: MatSeqAIJCUSPARSEGetArrayRead()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* read-only access: no state bump, no data movement; just drop the handle */
  *a = NULL;
  PetscFunctionReturn(0);
}
4222 
/*@C
   MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes: may trigger host-device copies if up-to-date matrix data is on host

.seealso: MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArray()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csrmat;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); /* caller may read: host data must reach the device first */
  if (!cusparsestruct->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)cusparsestruct->mat->mat;
  if (!csrmat->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csrmat->values->data().get();
  /* caller may write: device becomes the authoritative copy and any cached transpose values go stale */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/*@C
   MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray()

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

.seealso: MatSeqAIJCUSPARSEGetArray()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* values may have been modified through the returned pointer */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a   = NULL;
  PetscFunctionReturn(0);
}
4287 
/*@C
   MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes: does not trigger host-device copies and flags data validity on the GPU

.seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSERestoreArrayWrite()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csrmat;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* write-only access: deliberately skip MatSeqAIJCUSPARSECopyToGPU(), the old values are about to be overwritten */
  if (!cusparsestruct->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)cusparsestruct->mat->mat;
  if (!csrmat->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csrmat->values->data().get();
  /* device copy becomes authoritative; cached transpose values become stale */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4324 
/*@C
   MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite()

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

.seealso: MatSeqAIJCUSPARSEGetArrayWrite()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* values were (presumably) overwritten through the returned pointer */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a   = NULL;
  PetscFunctionReturn(0);
}
4352 
/* strict weak ordering on (row,col) pairs stored in the first two slots of the tuple;
   the value (slot 2) and permutation marker (slot 3) do not participate in the comparison */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    const int r1 = t1.get<0>();
    const int r2 = t2.get<0>();
    if (r1 != r2) return r1 < r2;     /* different rows: order by row */
    return t1.get<1>() < t2.get<1>(); /* same row: order by column */
  }
};
4363 
/* unary functor adding a fixed offset to an integer; used to shift column indices
   (and transpose row offsets) when concatenating two CSR matrices */
struct Shift
{
  int _shift; /* constant added to every input */

  Shift(int shift) : _shift(shift) {}
  __host__ __device__
  inline int operator() (const int &c)
  {
    return _shift + c;
  }
};
4375 
/* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in matlab notation */
/* With MAT_INITIAL_MATRIX a new matrix C = [A B] is built entirely on the device (values, row offsets,
   column indices, optionally the explicit transpose) and then mirrored to the host i/j structure;
   the row-merge permutation is cached in Ccusp->cooPerm so a MAT_REUSE_MATRIX call can refresh the
   numerical values only. MAT_INPLACE_MATRIX and ELL/HYB storage are not supported. */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     Annz,Bnnz;
  cusparseStatus_t             stat;
  PetscInt                     i,m,n,zero = 0;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidHeaderSpecific(B,MAT_CLASSID,2);
  PetscValidPointer(C,4);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
  /* row layouts must match since C takes one row from A and B each */
  if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n);
  if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
  if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    /* create C and hand-build its cusparse multiply structure and device CSR container */
    m     = A->rmap->n;
    n     = A->cmap->n + B->cmap->n;
    ierr  = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
    ierr  = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
    ierr  = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    c     = (Mat_SeqAIJ*)(*C)->data;
    Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
    Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr  = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows    = m;
    Ccusp->mat      = Cmat;
    Ccusp->mat->mat = Ccsr;
    Ccsr->num_rows  = m;
    Ccsr->num_cols  = n;
    /* cusparse matrix descriptor and the device-resident 0/1 scalars used by SpMV */
    stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
    if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
    if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");

    /* allocate C's device storage: nnz(C) = nnz(A) + nnz(B) */
    Acsr = (CsrMatrix*)Acusp->mat->mat;
    Bcsr = (CsrMatrix*)Bcusp->mat->mat;
    Annz = (PetscInt)Acsr->column_indices->size();
    Bnnz = (PetscInt)Bcsr->column_indices->size();
    c->nz = Annz + Bnnz;
    Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    Ccsr->num_entries = c->nz;
    Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      /* expand A and B to COO row indices so entries of the same global row can be merged */
      auto Acoo = new THRUSTINTARRAY32(Annz);
      auto Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff,*Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
          ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
          ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      stat = cusparseXcsr2coo(Acusp->handle,
                              Aroff->data().get(),
                              Annz,
                              m,
                              Acoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle,
                              Broff->data().get(),
                              Bnnz,
                              m,
                              Bcoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      /* Aperm/Bperm tag each entry with its origin (1 = from A, 0 = from B) so the
         merge permutation can be split back into an A part and a B part below */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
      /* B's columns live to the right of A's in C, hence the +A->cmap->n shift */
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
#endif
      /* merge the two (row,col,value,origin) streams into C ordered by (row,col) */
      auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
      auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
      auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
      auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
      auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
      auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
      auto p1 = Ccusp->cooPerm->begin();
      auto p2 = Ccusp->cooPerm->begin();
      thrust::advance(p2,Annz);
      PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
      /* undo the in-place column shift applied above, B must be left unmodified */
      thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
#endif
      /* split the merged positions by origin tag: cooPerm[0..Annz) holds where A's entries
         landed in C, cooPerm[Annz..) where B's landed; used by the MAT_REUSE_MATRIX path */
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      auto pred = thrust::identity<int>();
      PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
      PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
#endif
      /* compress C's COO row indices back to CSR row offsets */
      stat = cusparseXcoo2csr(Ccusp->handle,
                              Ccoo->data().get(),
                              c->nz,
                              m,
                              Ccsr->row_offsets->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
                               Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        /* C' = [A' ; B'] stacked by rows: A''s CSR goes first, then B''s with row offsets shifted by nnz(A) */
        ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
        ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr);
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix *CcsrT = new CsrMatrix;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated = PETSC_TRUE;
        Ccusp->rowoffsets_gpu = NULL;
        CmatT->cprowIndices = NULL;
        CmatT->mat = CcsrT;
        CcsrT->num_rows = n;
        CcsrT->num_cols = m;
        CcsrT->num_entries = c->nz;

        CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values = new THRUSTARRAY(c->nz);

        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
          thrust::advance(rT,-1); /* step back so B''s first offset overwrites A''s trailing one */
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
          thrust::copy(titb,tite,rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

        stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
                                 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
                                 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* mirror the device i/j structure to the host Mat_SeqAIJ (values c->a are allocated but left unfilled) */
    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii   = *Ccsr->row_offsets;
      jj   = *Ccsr->column_indices;
      cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    } else {
      cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
    c->maxnz = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax = 0;
    /* per-row lengths and row statistics, derived from the freshly copied row offsets */
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i+1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax,nn);
    }
    ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
    (*C)->nonzerostate++;
    ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
    ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* MAT_REUSE_MATRIX: the structure is unchanged, scatter the current values of A and B
       into C's device values array using the cached merge permutation cooPerm */
    if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",(*C)->rmap->n,B->rmap->n);
    c = (Mat_SeqAIJ*)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
      if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
      if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
      if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
      if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix*)Acusp->mat->mat;
      Bcsr = (CsrMatrix*)Bcusp->mat->mat;
      Ccsr = (CsrMatrix*)Ccusp->mat->mat;
      /* consistency checks before scattering values through the cached permutation */
      if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
      if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
      if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
      if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
      if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid,Acsr->num_entries);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      /* scatter A's values: cooPerm[0..nnz(A)) are the destinations inside C's values array */
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      thrust::for_each(zibait,zieait,VecCUDAEquals());
      /* scatter B's values through the second half of the permutation */
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
      thrust::for_each(zibbit,ziebit,VecCUDAEquals());
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr);
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
        /* transpose values are simply A''s followed by B''s (same layout as the initial build) */
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    }
  }
  ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
4679 
/* Copies n matrix values, selected by the (optional) index array idx, from the device CSR
   values of A into v. v may point to host or device memory; the copy direction is detected
   at runtime. When idx is NULL the first n values are copied verbatim. */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  PetscErrorCode    ierr;
  bool              dmem;  /* true when v resides in device memory */
  const PetscScalar *av;
  cudaError_t       cerr;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
  if (n && idx) {
    /* gather through the index set on the device */
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); /* idx was uploaded host -> device */

    THRUSTARRAY *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w = new THRUSTARRAY(n); /* device staging buffer; result is copied back to the host below */
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (w) {
      cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    delete w;
  } else {
    /* contiguous copy of the leading n values */
    cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  /* bug fix: the transfer just logged moves data device -> host, so it must be
     accounted as GpuToCpu (the original code logged it as CpuToGpu) */
  if (!dmem) { ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr); }
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4719