xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 995517668392c751565fcde851356ac409ca3e6e)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
7 
8 #include <petscconf.h>
9 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
10 #include <../src/mat/impls/sbaij/seq/sbaij.h>
11 #include <../src/vec/vec/impls/dvecimpl.h>
12 #include <petsc/private/vecimpl.h>
13 #undef VecType
14 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
15 #include <thrust/async/for_each.h>
16 
/* Enum-string table for MatCUSPARSEStorageFormat: value names, then the enum type name,
   the option prefix, and a 0 terminator -- the layout consumed by PetscOptionsEnum() */
const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
    0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
  } cusparseCsr2CscAlg_t;
  */
  /* Tables below mirror the cuSPARSE enums above, listed in 0-based value order for PetscOptionsEnum();
     consistency with the installed cusparse.h is checked in MatSetFromOptions_SeqAIJCUSPARSE() */
  const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
  const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
  const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif
53 
54 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
55 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
56 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
57 
58 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
59 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
60 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
61 
62 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
63 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
64 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
65 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
66 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
67 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
68 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
69 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
70 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
71 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
72 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
73 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
74 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
75 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
76 
77 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
78 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
79 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
80 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
81 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
82 
83 PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
84 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
85 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);
86 
87 PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
88 PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
89 
90 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
91 
/*
   MatCUSPARSESetStream - records the given CUDA stream in the matrix's GPU
   context and makes the matrix's cuSPARSE handle issue its work on it.
   Errors with PETSC_ERR_COR if the GPU context (spptr) has not been set up.
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cstat;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusp->stream = stream;
  cstat = cusparseSetStream(cusp->handle,cusp->stream);CHKERRCUSPARSE(cstat);
  PetscFunctionReturn(0);
}
103 
/*
   MatCUSPARSESetHandle - attaches an externally created cuSPARSE handle to the
   matrix.  A previously held, different handle is destroyed first.  The handle's
   pointer mode is (re)set to CUSPARSE_POINTER_MODE_DEVICE in all cases.
   Errors with PETSC_ERR_COR if the GPU context (spptr) has not been set up.
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cstat;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusp->handle != handle) {
    if (cusp->handle) {cstat = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(cstat);}
    cusp->handle = handle;
  }
  cstat = cusparseSetPointerMode(cusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(cstat);
  PetscFunctionReturn(0);
}
120 
/*
   MatCUSPARSEClearHandle - forgets (zeroes) the cuSPARSE handle stored in the
   matrix's GPU context without destroying it.
   NOTE(review): the handle is dropped, not destroyed -- presumably ownership
   stays with whoever installed it via MatCUSPARSESetHandle(); confirm at callers.
   A no-op for matrices that are not MATSEQAIJCUSPARSE or have no GPU context.
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          iscusparse;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&iscusparse);CHKERRQ(ierr);
  if (iscusparse && cusp && cusp->handle) cusp->handle = 0;
  PetscFunctionReturn(0);
}
133 
/* Reports the MatSolverType name for factor matrices produced by this package: always MATSOLVERCUSPARSE */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
140 
141 /*MC
142   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
on a single GPU of type seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
145   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
146   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
147   algorithms are not recommended. This class does NOT support direct solver operations.
148 
149   Level: beginner
150 
151 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
152 M*/
153 
/*
   Creates the (square, seqaijcusparse) factor matrix B for A and wires up the
   symbolic-factorization function pointers for the requested factor type.
   GPU symbolic kernels are installed unless A is bound to the CPU, in which
   case the plain SeqAIJ ones are used.  Unsupported factor types error out.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  /* propagate CPU binding to the factor when requested */
  if (A->boundtocpu && A->bindingpropagates) { ierr = MatBindToCPU(*B,PETSC_TRUE);CHKERRQ(ierr); }
  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU:
  case MAT_FACTOR_ILUDT:
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    if (A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    } else {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    }
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);CHKERRQ(ierr);
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC:
    if (A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    }
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]);CHKERRQ(ierr);
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  }

  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  (*B)->canuseordering = PETSC_TRUE;
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
195 
/*
   Type-specific implementation behind MatCUSPARSESetFormat() for seqaijcusparse.
   Both supported operations update the single storage-format field; any other
   operation value is rejected.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT: /* fall through: a sequential matrix has only one stored format */
  case MAT_CUSPARSE_ALL:
    cusp->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
213 
/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.

   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation; a no-op for types that do not provide one */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
240 
/*
   MatSetOption override for seqaijcusparse: handles MAT_FORM_EXPLICIT_TRANSPOSE
   locally (invalidating any cached transpose when the option is turned off) and
   forwards everything else to the SeqAIJ implementation.
*/
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);}
    A->form_explicit_transpose = flg;
  } else {
    ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
258 
259 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
260 
/*
   Numeric LU factorization: performed on the host by the SeqAIJ kernel (after
   syncing A's values back from the GPU), then the triangular factors are
   analyzed and uploaded to the GPU.  Solve function pointers are chosen based
   on whether the row/column permutations are identities (natural ordering has
   a faster solve path).
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      rowid,colid;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* host factorization needs an up-to-date host copy of A */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&rowid);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&colid);CHKERRQ(ierr);
  if (rowid && colid) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  /* no multi-RHS solves in either case */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
291 
/*
  Processes the -mat_cusparse_* runtime options for an (unfactored) seqaijcusparse
  matrix: storage format for SpMV / all operations and, with CUDA >= 11, the
  cuSPARSE algorithm choices for SpMV, SpMM and CSR-to-CSC conversion.
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* note: the enum result is written straight into cusparsestruct->spmvAlg */
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
    if (flg && CUSPARSE_SPMV_CSR_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#else
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#endif
    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
   #endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
330 
/* Symbolic ILU: performed on the host by the SeqAIJ kernel; the numeric stage
   is redirected to the CUSPARSE version. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *tf = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* discard triangular factors left over from any previous factorization */
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&tf);CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
342 
/* Symbolic LU: performed on the host by the SeqAIJ kernel; the numeric stage
   is redirected to the CUSPARSE version. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *tf = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* discard triangular factors left over from any previous factorization */
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&tf);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
354 
/* Symbolic ICC: performed on the host by the SeqAIJ kernel; the numeric stage
   is redirected to the CUSPARSE version. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *tf = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* discard triangular factors left over from any previous factorization */
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&tf);CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
366 
/* Symbolic Cholesky: performed on the host by the SeqAIJ kernel; the numeric
   stage is redirected to the CUSPARSE version. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *tf = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* discard triangular factors left over from any previous factorization */
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&tf);CHKERRQ(ierr);
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
378 
/*
  Builds (or refreshes) the unit-lower-triangular ILU factor L on the GPU.

  The host data of the factored SeqAIJ matrix A is repacked into a standalone
  CSR matrix with explicit 1.0 entries on the diagonal, uploaded into thrust
  device arrays, and a cuSPARSE triangular-solve analysis is run on it.  On the
  first call the full structure is built; on subsequent calls (structure cached
  in loTriFactorPtr) only the numerical values are refreshed via the pinned
  host staging buffer AA_h.  Runs only when the host copy is current
  (offloadmask UNALLOCATED or CPU); no-op for 0-row matrices.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* (n unit diagonals + all strictly-lower entries of rows 1..n-1; row 0 of L holds only its diagonal) */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {
        PetscScalar                       *AALo;

        /* pinned host staging buffer for the values; kept alive in AA_h for later value-only updates */
        cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the lower triangular matrix */
        cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the lower triangular matrix: row 0 is just the unit diagonal */
        AiLo[0]  = (PetscInt) 0;
        AiLo[n]  = nzLower;
        AjLo[0]  = (PetscInt) 0;
        AALo[0]  = (MatScalar) 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        rowOffset= 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          /* copy the strictly-lower entries of row i, then append the unit diagonal */
          ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;

        /* thrust assign() performs the host-to-device upload */
        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* newer cuSPARSE needs an explicit work buffer for the triangular-solve analysis */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(),
                               #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 loTriFactor->solveInfo,
                                 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                               #else
                                 loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
                               #endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer; AALo is retained (AA_h) for value-only refreshes, index buffers are freed */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h = AALo;
        cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
        /* indices logged as 32-bit since THRUSTINTARRAY32 is used -- presumably int-sized; confirm against cusparsematimpl.h */
        ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the lower triangular matrix (values only; sparsity pattern is unchanged).
           NOTE(review): vi is assigned but unused in this branch */
        loTriFactor->AA_h[0]  = 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
          offset      += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset      += 1;
          v  += nz;
        }
        /* upload the refreshed values to the device */
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
525 
526 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
527 {
528   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
529   PetscInt                          n = A->rmap->n;
530   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
531   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
532   cusparseStatus_t                  stat;
533   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
534   const MatScalar                   *aa = a->a,*v;
535   PetscInt                          *AiUp, *AjUp;
536   PetscInt                          i,nz, nzUpper, offset;
537   PetscErrorCode                    ierr;
538   cudaError_t                       cerr;
539 
540   PetscFunctionBegin;
541   if (!n) PetscFunctionReturn(0);
542   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
543     try {
544       /* next, figure out the number of nonzeros in the upper triangular matrix. */
545       nzUpper = adiag[0]-adiag[n];
546       if (!upTriFactor) {
547         PetscScalar *AAUp;
548 
549         cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
550 
551         /* Allocate Space for the upper triangular matrix */
552         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
553         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
554 
555         /* Fill the upper triangular matrix */
556         AiUp[0]=(PetscInt) 0;
557         AiUp[n]=nzUpper;
558         offset = nzUpper;
559         for (i=n-1; i>=0; i--) {
560           v  = aa + adiag[i+1] + 1;
561           vi = aj + adiag[i+1] + 1;
562 
563           /* number of elements NOT on the diagonal */
564           nz = adiag[i] - adiag[i+1]-1;
565 
566           /* decrement the offset */
567           offset -= (nz+1);
568 
569           /* first, set the diagonal elements */
570           AjUp[offset] = (PetscInt) i;
571           AAUp[offset] = (MatScalar)1./v[nz];
572           AiUp[i]      = AiUp[i+1] - (nz+1);
573 
574           ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
575           ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
576         }
577 
578         /* allocate space for the triangular factor information */
579         ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
580         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
581 
582         /* Create the matrix description */
583         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
584         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
585        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
586         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
587        #else
588         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
589        #endif
590         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
591         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
592 
593         /* set the operation */
594         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
595 
596         /* set the matrix */
597         upTriFactor->csrMat = new CsrMatrix;
598         upTriFactor->csrMat->num_rows = n;
599         upTriFactor->csrMat->num_cols = n;
600         upTriFactor->csrMat->num_entries = nzUpper;
601 
602         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
603         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
604 
605         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
606         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
607 
608         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
609         upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
610 
611         /* Create the solve analysis information */
612         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
613         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
614       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
615         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
616                                      upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
617                                      upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
618                                      upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
619                                      &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
620         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
621       #endif
622 
623         /* perform the solve analysis */
624         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
625                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
626                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
627                                  upTriFactor->csrMat->column_indices->data().get(),
628                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
629                                  upTriFactor->solveInfo,
630                                  upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
631                                #else
632                                  upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
633                                #endif
634         cerr = WaitForCUDA();CHKERRCUDA(cerr);
635         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
636 
637         /* assign the pointer */
638         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
639         upTriFactor->AA_h = AAUp;
640         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
641         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
642         ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
643       } else {
644         if (!upTriFactor->AA_h) {
645           cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
646         }
647         /* Fill the upper triangular matrix */
648         offset = nzUpper;
649         for (i=n-1; i>=0; i--) {
650           v  = aa + adiag[i+1] + 1;
651 
652           /* number of elements NOT on the diagonal */
653           nz = adiag[i] - adiag[i+1]-1;
654 
655           /* decrement the offset */
656           offset -= (nz+1);
657 
658           /* first, set the diagonal elements */
659           upTriFactor->AA_h[offset] = 1./v[nz];
660           ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
661         }
662         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
663         ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
664       }
665     } catch(char *ex) {
666       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
667     }
668   }
669   PetscFunctionReturn(0);
670 }
671 
/*
   Builds both ILU triangular factors on the GPU and caches, on first use,
   the row/column permutation index arrays that the permuted MatSolve paths
   read back from cusparseTriFactors.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           rowis,colis;
  PetscBool                    rid,cid;
  PetscInt                     nrows;

  PetscFunctionBegin;
  if (!factors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  rowis = aij->row;
  colis = aij->icol;
  nrows = A->rmap->n;

  /* push the L and U factors of the ILU to the GPU */
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  if (!factors->workVector) factors->workVector = new THRUSTARRAY(nrows);
  factors->nnz   = aij->nz;
  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* cache the row permutation on the device (used by the lower triangular solve);
     only needed when the ordering is not the identity, and only built once */
  ierr = ISIdentity(rowis,&rid);CHKERRQ(ierr);
  if (!rid && !factors->rpermIndices) {
    const PetscInt *ridx;

    ierr = ISGetIndices(rowis,&ridx);CHKERRQ(ierr);
    factors->rpermIndices = new THRUSTINTARRAY(nrows);
    factors->rpermIndices->assign(ridx,ridx+nrows);
    ierr = ISRestoreIndices(rowis,&ridx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(nrows*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* cache the column permutation on the device (used by the upper triangular solve) */
  ierr = ISIdentity(colis,&cid);CHKERRQ(ierr);
  if (!cid && !factors->cpermIndices) {
    const PetscInt *cidx;

    ierr = ISGetIndices(colis,&cidx);CHKERRQ(ierr);
    factors->cpermIndices = new THRUSTINTARRAY(nrows);
    factors->cpermIndices->assign(cidx,cidx+nrows);
    ierr = ISRestoreIndices(colis,&cidx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(nrows*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
715 
/*
   Builds (or refreshes) the two triangular-solve structures for an ICC
   factorization on the GPU. Only the upper triangle is stored (CSR arrays
   AiUp/AjUp); both factors share that sparsity pattern. The "upper" factor
   is solved NON_TRANSPOSE with a unit diagonal, while the "lower" factor
   reuses the upper-stored pattern and is solved with
   CUSPARSE_OPERATION_TRANSPOSE (see solveOp below), with its scaled values
   in AALo. On a first call the full structures are created; on later calls
   (offloadmask CPU) only the numerical values are re-uploaded.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;       /* host-pinned CSR row offsets / column indices of the upper factor */
  PetscScalar                       *AAUp;              /* host-pinned values of the upper factor */
  PetscScalar                       *AALo;              /* host-pinned values of the (transposed) lower factor */
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is cast both to Mat_SeqAIJ (above) and Mat_SeqSBAIJ here;
     the i/j/a members are read through the SBAIJ view — presumably the factor is
     stored in SBAIJ layout for ICC; confirm against the symbolic factorization */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* nothing to do unless the host holds data the device does not have yet */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host buffers for the factor values (freed at the end of this scope) */
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (!upTriFactor && !loTriFactor) {
        /* first call: build the structure (pattern + values) from scratch */
        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          /* the diagonal is stored LAST within each row (v[nz]); both factors
             store its reciprocal at the row's leading slot */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            /* off-diagonals: negated for the upper factor, additionally scaled
               by 1/diag for the transposed lower factor */
            ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        /* the unit diagonal is implicit for the upper solve (1/diag already folded
           into the stored values) */
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(),
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 upTriFactor->solveInfo,
                                 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                                #else
                                  upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
                                #endif
        /* analysis runs asynchronously; synchronize before ending the log event */
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        /* intentionally UPPER: the lower solve is performed as a TRANSPOSE solve
           on the upper-stored pattern (see solveOp below) */
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        /* same sparsity pattern as the upper factor, different values (AALo) */
        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(),
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 loTriFactor->solveInfo,
                                 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                                #else
                                 loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
                                #endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        /* two factors uploaded: pattern (int) plus values (scalar) for each */
        ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      } else {
        /* structures already exist: recompute values only and re-upload */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
935 
/*
   Builds the ICC triangular-solve structures on the GPU and caches the
   row/column permutation index arrays used by the permuted MatSolve paths.

   Fix: guard the permutation-upload branch with !rpermIndices, matching the
   ILU variant (MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU). Without the guard,
   every numeric re-factorization allocated fresh THRUSTINTARRAYs and
   overwrote the old pointers, leaking device memory. rpermIndices and
   cpermIndices are always created together below, so one check suffices.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  /* total stored nonzeros of both factors: off-diagonals appear twice, diagonal once */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices: upload the permutation and its inverse once */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity && !cusparseTriFactors->rpermIndices) {
    IS             iip;
    const PetscInt *irip,*rip;

    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
973 
/*
   Numeric Cholesky factorization: factor on the CPU, then select the GPU
   solve variants and push the triangular factors to the device.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *bseq = (Mat_SeqAIJ*)B->data;
  IS             rowperm = bseq->row;
  PetscBool      identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* pick the MatSolve variants: the natural-ordering path skips the
     permutation vectors entirely */
  ierr = ISIdentity(rowperm,&identity);CHKERRQ(ierr);
  B->ops->solve             = identity ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering
                                       : MatSolve_SeqAIJCUSPARSE;
  B->ops->solvetranspose    = identity ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering
                                       : MatSolveTranspose_SeqAIJCUSPARSE;
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors onto the GPU */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1003 
1004 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
1005 {
1006   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1007   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1008   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1009   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
1010   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1011   cusparseStatus_t                  stat;
1012   cusparseIndexBase_t               indexBase;
1013   cusparseMatrixType_t              matrixType;
1014   cusparseFillMode_t                fillMode;
1015   cusparseDiagType_t                diagType;
1016   cudaError_t                       cerr;
1017   PetscErrorCode                    ierr;
1018 
1019   PetscFunctionBegin;
1020   /* allocate space for the transpose of the lower triangular factor */
1021   ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
1022   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1023 
1024   /* set the matrix descriptors of the lower triangular factor */
1025   matrixType = cusparseGetMatType(loTriFactor->descr);
1026   indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
1027   fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1028     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1029   diagType = cusparseGetMatDiagType(loTriFactor->descr);
1030 
1031   /* Create the matrix description */
1032   stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
1033   stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
1034   stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
1035   stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
1036   stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
1037 
1038   /* set the operation */
1039   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1040 
1041   /* allocate GPU space for the CSC of the lower triangular factor*/
1042   loTriFactorT->csrMat = new CsrMatrix;
1043   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
1044   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
1045   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
1046   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
1047   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1048   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
1049 
1050   /* compute the transpose of the lower triangular factor, i.e. the CSC */
1051 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1052   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1053                                        loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1054                                        loTriFactor->csrMat->values->data().get(),
1055                                        loTriFactor->csrMat->row_offsets->data().get(),
1056                                        loTriFactor->csrMat->column_indices->data().get(),
1057                                        loTriFactorT->csrMat->values->data().get(),
1058                                        loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1059                                        CUSPARSE_ACTION_NUMERIC,indexBase,
1060                                        CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1061   cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1062 #endif
1063 
1064   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1065   stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1066                           loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1067                           loTriFactor->csrMat->values->data().get(),
1068                           loTriFactor->csrMat->row_offsets->data().get(),
1069                           loTriFactor->csrMat->column_indices->data().get(),
1070                           loTriFactorT->csrMat->values->data().get(),
1071                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1072                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1073                           CUSPARSE_ACTION_NUMERIC, indexBase,
1074                           CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat);
1075                         #else
1076                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1077                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1078                         #endif
1079   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1080   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1081 
1082   /* Create the solve analysis information */
1083   ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1084   stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1085 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1086   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
1087                                 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1088                                 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1089                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
1090                                 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1091   cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1092 #endif
1093 
1094   /* perform the solve analysis */
1095   stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
1096                            loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1097                            loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1098                            loTriFactorT->csrMat->column_indices->data().get(),
1099                           #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1100                            loTriFactorT->solveInfo,
1101                            loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
1102                           #else
1103                            loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1104                           #endif
1105   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1106   ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1107 
1108   /* assign the pointer */
1109   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1110 
1111   /*********************************************/
1112   /* Now the Transpose of the Upper Tri Factor */
1113   /*********************************************/
1114 
1115   /* allocate space for the transpose of the upper triangular factor */
1116   ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
1117   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1118 
1119   /* set the matrix descriptors of the upper triangular factor */
1120   matrixType = cusparseGetMatType(upTriFactor->descr);
1121   indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
1122   fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1123     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1124   diagType = cusparseGetMatDiagType(upTriFactor->descr);
1125 
1126   /* Create the matrix description */
1127   stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
1128   stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
1129   stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
1130   stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
1131   stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
1132 
1133   /* set the operation */
1134   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1135 
1136   /* allocate GPU space for the CSC of the upper triangular factor*/
1137   upTriFactorT->csrMat = new CsrMatrix;
1138   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1139   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1140   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1141   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
1142   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1143   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1144 
1145   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1146 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1147   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
1148                                 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1149                                 upTriFactor->csrMat->values->data().get(),
1150                                 upTriFactor->csrMat->row_offsets->data().get(),
1151                                 upTriFactor->csrMat->column_indices->data().get(),
1152                                 upTriFactorT->csrMat->values->data().get(),
1153                                 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1154                                 CUSPARSE_ACTION_NUMERIC,indexBase,
1155                                 CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1156   cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1157 #endif
1158 
1159   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1160   stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
1161                           upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1162                           upTriFactor->csrMat->values->data().get(),
1163                           upTriFactor->csrMat->row_offsets->data().get(),
1164                           upTriFactor->csrMat->column_indices->data().get(),
1165                           upTriFactorT->csrMat->values->data().get(),
1166                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1167                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1168                           CUSPARSE_ACTION_NUMERIC, indexBase,
1169                           CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat);
1170                         #else
1171                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1172                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1173                         #endif
1174 
1175   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1176   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1177 
1178   /* Create the solve analysis information */
1179   ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1180   stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1181   #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1182   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
1183                                  upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1184                                  upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1185                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
1186                                  &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1187   cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1188   #endif
1189 
1190   /* perform the solve analysis */
1191   stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
1192                            upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1193                            upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1194                            upTriFactorT->csrMat->column_indices->data().get(),
1195                           #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1196                            upTriFactorT->solveInfo,
1197                            upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
1198                           #else
1199                            upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1200                           #endif
1201 
1202   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1203   ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1204 
1205   /* assign the pointer */
1206   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1207   PetscFunctionReturn(0);
1208 }
1209 
/* Host/device unary functor that truncates the real part of a PetscScalar to a
   PetscInt. Used with thrust::transform to recover integer permutation indices
   from a scalar array (see the csr2csc_i construction in
   MatSeqAIJCUSPARSEFormExplicitTranspose). */
struct PetscScalarToPetscInt {
  __host__ __device__ PetscInt operator()(PetscScalar s)
  {
    return static_cast<PetscInt>(PetscRealPart(s));
  }
};
1218 
/* Form the explicit transpose of A on the GPU and cache it in
   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose.  A no-op when A->transupdated
   is already set.  For MAT_CUSPARSE_CSR the transpose is computed with cusparse
   csr2csc on the first call and the value permutation is cached in
   cusparsestruct->csr2csc_i, so that later refreshes reduce to a thrust gather
   of A's current values through that permutation. */
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(0); /* cached transpose is still up to date */
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
    /* non-CSR transposes are rebuilt from scratch below; drop stale structures */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
  }
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

    /* set alpha and beta (device-resident scalars required by cusparse SpMV/SpMM) */
    err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* allocate the transposed CSR with swapped row/column dimensions */
      CsrMatrix *matrixT = new CsrMatrix;
      matstructT->mat = matrixT;
      matrixT->num_rows = A->cmap->n;
      matrixT->num_cols = A->rmap->n;
      matrixT->num_entries = a->nz;
      matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values = new THRUSTARRAY(a->nz);

      /* upload A's host row offsets (a->i); csr2csc below reads them on device */
      if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      #if PETSC_PKG_CUDA_VERSION_GE(11,2,1)
        stat = cusparseCreateCsr(&matstructT->matDescr,
                               matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                               matrixT->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
      #else
        /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
           see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

           I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
           it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
           when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
        */
        if (matrixT->num_entries) {
          stat = cusparseCreateCsr(&matstructT->matDescr,
                                 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                                 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                                 matrixT->values->data().get(),
                                 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I,
                                 indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);

        } else {
          matstructT->matDescr = NULL;
          matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
        }
      #endif
     #endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
   #else
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows = A->rmap->n;
      temp->num_cols = A->cmap->n;
      temp->num_entries = a->nz;
      temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle,
                              matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get());CHKERRCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      /* NOTE(review): tempT keeps A's row/column dimensions instead of swapping
         them as the CSR branch above does, and its row_offsets are sized with
         A->rmap->n+1; this looks suspicious for rectangular matrices -- verify
         (this branch is only compiled for pre-CUDA-11 ELL/HYB). */
      tempT->num_rows = A->rmap->n;
      tempT->num_cols = A->cmap->n;
      tempT->num_entries = a->nz;
      tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                              temp->num_cols, temp->num_entries,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get(),
                              tempT->values->data().get(),
                              tempT->column_indices->data().get(),
                              tempT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

      /* Last, convert CSC to HYB */
      cusparseHybMat_t hybMat;
      stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
      cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
        CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                              matstructT->descr, tempT->values->data().get(),
                              tempT->row_offsets->data().get(),
                              tempT->column_indices->data().get(),
                              hybMat, 0, partition);CHKERRCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY*) tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
        delete (CsrMatrix*) tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY*) temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
        delete (CsrMatrix*) temp;
      }
     #endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
    if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix");
    if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows");
    if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols");
    if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values");
    if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT");
    if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows");
    if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols");
    if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
      ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    if (!cusparsestruct->csr2csc_i) {
      /* First time through: run csr2csc on the sequence 0,1,2,... stored as
         scalars, so the transposed "values" are exactly the permutation that
         maps A's value array into the transposed layout; cache it as
         csr2csc_i so future refreshes are a single thrust gather. */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      void   *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                           A->cmap->n, matrix->num_entries,
                                           matrix->values->data().get(),
                                           cusparsestruct->rowoffsets_gpu->data().get(),
                                           matrix->column_indices->data().get(),
                                           matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                           CUSPARSE_ACTION_NUMERIC,indexBase,
                                           cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat);
      err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err);
     #endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                              A->cmap->n,matrix->num_entries,
                              csr2csc_a.data().get(),
                              cusparsestruct->rowoffsets_gpu->data().get(),
                              matrix->column_indices->data().get(),
                              matrixT->values->data().get(),
                             #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                              matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                              CUSPARSE_ACTION_NUMERIC,indexBase,
                              cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat);
                             #else
                              matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
                             #endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
      }

      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      err = cudaFree(csr2cscBuffer);CHKERRCUDA(err);
     #endif
    }
    /* refresh the transposed values by gathering A's current values through csr2csc_i */
    PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
                                                     thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
                                                     matrixT->values->begin()));
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
1448 
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/* Solve A^T x = b using the cached triangular factors of A, for a factorization
   with row/column permutations.  The explicitly transposed factor structures
   (upTriFactorT, loTriFactorT) are built lazily on first use by
   MatSeqAIJCUSPARSEAnalyzeTransposeForSolve.  Steps: gather b through the row
   permutation, apply the upper-factor solve, then the lower-factor solve
   (for A = LU one has A^T = U^T L^T, hence U first), and finally scatter
   through the column permutation into x. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: x <- b[rperm] */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U: tempGPU <- (U^T)^{-1} x */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Then, solve L: x <- (L^T)^{-1} tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                         xarray);CHKERRCUSPARSE(stat);
                      #endif

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1537 
/* Solve A^T x = b for a factorization done in natural ordering: no row/column
   permutation vectors are applied, so the two transposed triangular solves
   (upper factor first, then lower; see MatSolveTranspose_SeqAIJCUSPARSE for
   the U-before-L rationale) read barray and write xarray directly, with the
   cached workVector as the intermediate. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U: tempGPU <- solve with rhs barray */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Then, solve L: xarray <- solve with rhs tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        xarray);CHKERRCUSPARSE(stat);
                      #endif

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1607 
/* Solve A x = b using the cached triangular factors, for a factorization with
   row/column permutations.  Steps: gather b through the row permutation into
   the work vector, forward solve with L, back solve with U, then scatter the
   result through the column permutation into x. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: tempGPU <- b[rperm] */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L: xarray <- L^{-1} tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                         xarray,
                         loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                         xarray);CHKERRCUSPARSE(stat);
                      #endif

  /* Then, solve U: tempGPU <- U^{-1} xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,xarray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Last, reorder with the column permutation: x <- tempGPU[cperm] */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1683 
/* Solve A x = b on the GPU using the cached cuSPARSE LU triangular factors, for a
   factorization done in natural ordering: unlike the permuted variant above, no
   row/column permutation is applied, so b feeds the L-solve directly and the U-solve
   writes straight into x.  bb is read-only, xx is write-only. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; /* device scratch holding the intermediate L^{-1} b */
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L: tempGPU <- L^{-1} b.  The CUDA >= 9 csrsv2 API takes the nnz
     count, a solve policy and a pre-sized work buffer; the legacy API does not. */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        loTriFactor->solvePolicy,loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Next, solve U: x <- U^{-1} tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        xarray);CHKERRCUSPARSE(stat);
                      #endif

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves cost ~2 flops per stored factor entry, minus the n divisions counted once */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1745 
/* Copy the matrix numerical values from the device CSR back to the host AIJ array.
   Only the values are copied (the nonzero pattern is assumed unchanged); afterwards
   both copies are valid, so the offload mask becomes PETSC_OFFLOAD_BOTH.  A no-op
   unless the GPU currently holds the only up-to-date copy. */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cudaError_t        cerr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat;

    ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    /* device-to-host copy of the a->nz stored values only; row/col index arrays stay put */
    cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
1766 
/* Read-write access to the host values array: sync values down from the GPU first
   so the caller sees current data, then hand out the host pointer. */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr   = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  *array = ((Mat_SeqAIJ*)A->data)->a;
  PetscFunctionReturn(0);
}
1776 
/* End read-write access: the caller may have modified host values, so mark the CPU
   copy as the authoritative one (the GPU copy is now stale). */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array         = NULL;
  PetscFunctionReturn(0);
}
1784 
/* Read-only access to the host values array: sync down from the GPU so the data is
   current, but (unlike the read-write variant's Restore) the offload mask will not
   be invalidated on restore. */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr   = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  *array = ((Mat_SeqAIJ*)A->data)->a;
  PetscFunctionReturn(0);
}
1794 
/* End read-only access: nothing was modified, so the offload mask is left alone. */
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(0);
}
1801 
/* Write-only access to the host values array: no device-to-host sync is needed
   because the caller promises to overwrite the values. */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = ((Mat_SeqAIJ*)A->data)->a;
  PetscFunctionReturn(0);
}
1808 
/* End write access: the host now holds the only valid values, so mark the CPU copy
   authoritative (the GPU copy is stale until the next CopyToGPU). */
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array         = NULL;
  PetscFunctionReturn(0);
}
1816 
/* Mirror the host AIJ matrix onto the GPU in the storage format selected in
   Mat_SeqAIJCUSPARSE (CSR, ELL or HYB).  Two paths:
     - same nonzero pattern as the cached copy and CSR format: copy values only;
     - otherwise: tear down and rebuild the whole device structure (descriptors,
       scalar constants, index/value arrays, compressed-row index list).
   A no-op when the GPU copy is already current; errors if the matrix is bound to
   the CPU. */
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE; /* set false if host values are absent, so mask stays GPU-only */
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      matrix->values->assign(a->a, a->a+a->nz); /* host-to-device value upload via thrust */
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* new values invalidate the cached transpose values (structure kept) */
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
    } else {
      PetscInt nnz;
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* pattern changed (or non-CSR format): destroy and rebuild everything */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        /* pick row pointers: compressed-row storage skips empty rows and keeps a
           row-index map (ridx); otherwise use the plain CSR row offsets */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
        if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");
        /* no host values yet: build structure only and do not claim the host copy is valid */
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
        else nnz = a->nz;

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident 1/0 scalar constants, needed because the handle uses
           CUSPARSE_POINTER_MODE_DEVICE below */
        err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                    mat->num_rows, mat->num_cols, mat->num_entries,
                                    mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                    mat->values->data().get(),
                                    CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                    CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
          }
         #endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
         #else
          /* build a temporary CSR copy on the device, convert it to HYB/ELL, then
             free the temporary */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
              matstruct->descr, mat->values->data().get(),
              mat->row_offsets->data().get(),
              mat->column_indices->data().get(),
              hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
         #endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m); /* scratch for scattering compressed-row results */
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m; /* number of PetscInt row indices uploaded, for the transfer log below */
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      cusparsestruct->nonzerostate = A->nonzerostate; /* remember the pattern we just mirrored */
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
1973 
/* Thrust functor for zipped (source, destination) element pairs: accumulates the
   first tuple element into the second, i.e. dst += src. */
struct VecCUDAPlusEquals
{
  template <typename Pair>
  __host__ __device__ void operator()(Pair zipped)
  {
    thrust::get<1>(zipped) = thrust::get<0>(zipped) + thrust::get<1>(zipped);
  }
};
1983 
/* Thrust functor for zipped (source, destination) element pairs: copies the first
   tuple element into the second, i.e. dst = src. */
struct VecCUDAEquals
{
  template <typename Pair>
  __host__ __device__ void operator()(Pair zipped)
  {
    thrust::get<1>(zipped) = thrust::get<0>(zipped);
  }
};
1993 
/* Thrust functor for zipped element pairs, copying in the opposite direction of
   VecCUDAEquals: the second tuple element is copied into the first. */
struct VecCUDAEqualsReverse
{
  template <typename Pair>
  __host__ __device__ void operator()(Pair zipped)
  {
    thrust::get<0>(zipped) = thrust::get<1>(zipped);
  }
};
2003 
/* Per-product workspace attached to C->product->data for AIJCUSPARSE mat-mat
   products; allocated zeroed with PetscNew and freed by MatDestroy_MatMatCusparse. */
struct MatMatCusparse {
  PetscBool             cisdense;  /* C was MATSEQDENSE on entry; convert the result back to host dense */
  PetscScalar           *Bt;       /* device buffer holding explicit B^T (pre-CUDA-11 path, where csrmm cannot transpose B) */
  Mat                   X;         /* intermediate dense product for RARt/PtAP (X = A*R^T or A*P) */
  PetscBool             reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops;
  CsrMatrix             *Bcsr;     /* presumably a CSR copy of B used by sparse-sparse products -- usage not in this chunk, verify */

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr;
  PetscBool             initialized;   /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr;
  cusparseDnMatDescr_t  matCDescr;
  PetscInt              Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  void                  *dBuffer4;
  void                  *dBuffer5;
 #endif
  size_t                mmBufferSize;
  void                  *mmBuffer;
  void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};
2028 
/* Destructor for MatMatCusparse product data: releases every device buffer,
   cuSPARSE descriptor and the intermediate matrix X, then the struct itself.
   Fields never allocated are NULL (PetscNew zeroes the struct) and are either
   guarded by if-checks or, for Bt, passed to cudaFree which accepts NULL. */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  PetscErrorCode   ierr;
  MatMatCusparse   *mmdata = (MatMatCusparse *)data;
  cudaError_t      cerr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t stat;
 #endif

  PetscFunctionBegin;
  cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr);
  delete mmdata->Bcsr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->matBDescr)   { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->matCDescr)   { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->spgemmDesc)  { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); }
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  if (mmdata->dBuffer4)  { cerr = cudaFree(mmdata->dBuffer4);CHKERRCUDA(cerr); }
  if (mmdata->dBuffer5)  { cerr = cudaFree(mmdata->dBuffer5);CHKERRCUDA(cerr); }
 #endif
  if (mmdata->mmBuffer)  { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); }
  if (mmdata->mmBuffer2) { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); }
 #endif
  ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2057 
2058 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
2059 
/* Numeric phase of C = op(A)*op(B) with A sparse (AIJCUSPARSE) and B dense.
   Supports AB, AtB, ABt, PtAP and RARt; for the last two the sparse-times-dense
   result goes into the intermediate mmdata->X and a dense-dense multiply with B
   finishes the product.  CUDA >= 11 uses the generic cusparseSpMM API (with cached
   descriptors and a reusable work buffer); older CUDA falls back to csrmm, which
   cannot transpose B, so B^T is formed explicitly with cublasXgeam first. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr   = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp   = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* choose which stored matrix (A or its cached transpose), op, and result shape (m x n) */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      /* use the explicitly stored A^T with a non-transpose op */
      ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);

  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* RARt/PtAP write the sparse product into the intermediate X, everything else directly into C */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
    /* grow the cached SpMM work buffer only when the required size exceeds what we have */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      cudaError_t cerr;
      cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
      cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
 #else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* out-of-place transpose: Bt = 1*B^T + 0*B^T via geam */
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k    = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
 #endif
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  /* finish RARt/PtAP with a dense-dense multiply: C = B * X (transposing B for PtAP) */
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  /* undo the temporary conversions performed at entry */
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2232 
/* Symbolic phase of C = op(A)*op(B) with A sparse and B dense: set the sizes and
   type of C, allocate the MatMatCusparse workspace (B^T buffer on pre-CUDA-11,
   intermediate X for RARt/PtAP), and install the numeric routine. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  /* result dimensions per product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense;
 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
 #endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2306 
2307 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2308 {
2309   Mat_Product                  *product = C->product;
2310   Mat                          A,B;
2311   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2312   Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
2313   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2314   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2315   PetscBool                    flg;
2316   PetscErrorCode               ierr;
2317   cusparseStatus_t             stat;
2318   cudaError_t                  cerr;
2319   MatProductType               ptype;
2320   MatMatCusparse               *mmdata;
2321 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2322   cusparseSpMatDescr_t         BmatSpDescr;
2323 #endif
2324   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2325 
2326   PetscFunctionBegin;
2327   MatCheckProduct(C,1);
2328   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
2329   ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2330   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
2331   mmdata = (MatMatCusparse*)C->product->data;
2332   A = product->A;
2333   B = product->B;
2334   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
2335     mmdata->reusesym = PETSC_FALSE;
2336     Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2337     if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2338     Cmat = Ccusp->mat;
2339     if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
2340     Ccsr = (CsrMatrix*)Cmat->mat;
2341     if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
2342     goto finalize;
2343   }
2344   if (!c->nz) goto finalize;
2345   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2346   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2347   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2348   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
2349   if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2350   if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2351   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2352   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2353   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2354   if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2355   if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2356   if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2357   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2358   ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
2359 
2360   ptype = product->type;
2361   if (A->symmetric && ptype == MATPRODUCT_AtB) {
2362     ptype = MATPRODUCT_AB;
2363     if (!product->symbolic_used_the_fact_A_is_symmetric) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric");
2364   }
2365   if (B->symmetric && ptype == MATPRODUCT_ABt) {
2366     ptype = MATPRODUCT_AB;
2367     if (!product->symbolic_used_the_fact_B_is_symmetric) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric");
2368   }
2369   switch (ptype) {
2370   case MATPRODUCT_AB:
2371     Amat = Acusp->mat;
2372     Bmat = Bcusp->mat;
2373     break;
2374   case MATPRODUCT_AtB:
2375     Amat = Acusp->matTranspose;
2376     Bmat = Bcusp->mat;
2377     break;
2378   case MATPRODUCT_ABt:
2379     Amat = Acusp->mat;
2380     Bmat = Bcusp->matTranspose;
2381     break;
2382   default:
2383     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2384   }
2385   Cmat = Ccusp->mat;
2386   if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2387   if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2388   if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
2389   Acsr = (CsrMatrix*)Amat->mat;
2390   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
2391   Ccsr = (CsrMatrix*)Cmat->mat;
2392   if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
2393   if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
2394   if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
2395   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2396 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2397   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
2398   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2399   #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2400     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
2401                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2402                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2403                                mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2404   #else
2405     stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
2406                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2407                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2408                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2409     stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
2410                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2411                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2412   #endif
2413 #else
2414   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
2415                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2416                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2417                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2418                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2419 #endif
2420   ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
2421   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2422   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2423   C->offloadmask = PETSC_OFFLOAD_GPU;
2424 finalize:
2425   /* shorter version of MatAssemblyEnd_SeqAIJ */
2426   ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
2427   ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
2428   ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
2429   c->reallocs         = 0;
2430   C->info.mallocs    += 0;
2431   C->info.nz_unneeded = 0;
2432   C->assembled = C->was_assembled = PETSC_TRUE;
2433   C->num_ass++;
2434   PetscFunctionReturn(0);
2435 }
2436 
/*
   MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE - symbolic phase of the sparse
   matrix-matrix product C = A*B (also A^t*B and A*B^t) with A, B, C all SEQAIJCUSPARSE.

   Determines the nonzero structure of C on the GPU via cuSPARSE SpGEMM, allocates the
   device CSR storage for C, and mirrors the row offsets/column indices back to the host
   Mat_SeqAIJ so that CPU-side inspection (e.g. MatMarkDiagonal) keeps working. Three
   cuSPARSE API generations are supported, selected at compile time:
     - CUDA >= 11.4: cusparseSpGEMMreuse_* (structure reused across numeric calls)
     - CUDA 11.0 - 11.3: cusparseSpGEMM_* (symbolic cannot be separated; values computed too)
     - CUDA < 11: legacy cusparseXcsrgemmNnz + csrgemm
   On success C->ops->productnumeric is set so the numeric phase uses the stored
   MatMatCusparse data (mmdata).

   NOTE(review): transpose products are realized through explicit transposes of A or B
   because, per the comment on opA/opB below, cuSPARSE SpGEMM does not support the
   transpose operation flags.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *a,*b,*c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     i,j,m,n,k;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble               flops;
  PetscBool                    biscompressed,ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
  cusparseSpMatDescr_t         BmatSpDescr;
#else
  int                          cnz;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  /* both operands must be SEQAIJCUSPARSE; no fallback here */
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ*)A->data;
  b = (Mat_SeqAIJ*)B->data;
  /* product data: freed by MatDestroy_MatMatCusparse when the product is destroyed */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");

  /* If A (resp. B) is symmetric, A^t*B (resp. A*B^t) reduces to A*B; record that the
     symbolic phase exploited symmetry so the numeric phase can verify consistency */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
  }
  if (B->symmetric && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
  }
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  /* Select operand mult structs and result dimensions: C is m x n with inner dimension k.
     Transposed operands use the explicitly formed transpose (see opA/opB note above).
     When the left operand uses compressed row storage, so will C (rows of C mirror rows
     of the left operand); a compressed B must be expanded to full row offsets below. */
  switch (ptype) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    k = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr);
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  ierr  = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  ierr  = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  c     = (Mat_SeqAIJ*)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
  Ccsr  = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, then c will be in compressed row format */
    c->compressedrow.nrows = a->compressedrow.nrows;
    ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
    ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  /* Ccusp->nrows is the number of device CSR rows (compressed count if compressed) */
  Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
  Ccusp->mat      = Cmat;
  Ccusp->mat->mat = Ccsr;
  Ccsr->num_rows    = Ccusp->nrows;
  Ccsr->num_cols    = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
  stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
  /* device-resident scalar constants, used with CUSPARSE_POINTER_MODE_DEVICE */
  cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
    thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
    c->nz = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix*)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix */
    /* Build a shallow CsrMatrix view of B with uncompressed row offsets; column indices
       and values are shared with (not copied from) B's compressed representation */
    CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
    Bcsr = new CsrMatrix;
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices;
    Bcsr->values         = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
      ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr = Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
                               Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                               Bcsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  /* precompute flops count: 2*nnz(B row) per A entry for AB; per-row nnz products for AtB.
     Stored in mmdata so the numeric phase can log the same count. */
  if (ptype == MATPRODUCT_AB) {
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i+1];
      for (j=st; j<en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2.*(b->i[brow+1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt anzi = a->i[i+1] - a->i[i];
      const PetscInt bnzi = b->i[i+1] - b->i[i];
      flops += (2.*anzi)*bnzi;
    }
  } else { /* TODO */
    flops = 0.;
  }

  mmdata->flops = flops;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* C descriptor is created with nnz = 0 and NULL arrays; the SpGEMM nnz step fills in
     the row offsets and the actual nnz, after which the pointers are set below */
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
                          NULL, NULL, NULL,
                          CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                          CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
 {
  /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
     We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
  */
  void*  dBuffer1 = NULL;
  void*  dBuffer2 = NULL;
  void*  dBuffer3 = NULL;
  /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
  size_t bufferSize1 = 0;
  size_t bufferSize2 = 0;
  size_t bufferSize3 = 0;
  size_t bufferSize4 = 0;
  size_t bufferSize5 = 0;

  /*----------------------------------------------------------------------*/
  /* ask bufferSize1 bytes for external memory */
  stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                            CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                            &bufferSize1, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &dBuffer1, bufferSize1);CHKERRCUDA(cerr);
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                            CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                            &bufferSize1, dBuffer1);CHKERRCUSPARSE(stat);

  /*----------------------------------------------------------------------*/
  /* nnz step: query-then-call pattern (NULL buffers to size, real buffers to execute) */
  stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                 &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &dBuffer2, bufferSize2);CHKERRCUDA(cerr);
  cerr = cudaMalloc((void**) &dBuffer3, bufferSize3);CHKERRCUDA(cerr);
  cerr = cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4);CHKERRCUDA(cerr);
  stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                 &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);CHKERRCUSPARSE(stat);
  cerr = cudaFree(dBuffer1);CHKERRCUDA(cerr);
  cerr = cudaFree(dBuffer2);CHKERRCUDA(cerr);

  /*----------------------------------------------------------------------*/
  /* get matrix C non-zero entries C_nnz1 */
  stat  = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
  c->nz = (PetscInt) C_nnz1;
  /* allocate matrix C */
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values         = new THRUSTARRAY(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  /* update matC with the new pointers */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());CHKERRCUSPARSE(stat);

  /*----------------------------------------------------------------------*/
  stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                  &bufferSize5, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5);CHKERRCUDA(cerr);
  stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                  &bufferSize5, mmdata->dBuffer5);CHKERRCUSPARSE(stat);
  cerr = cudaFree(dBuffer3);CHKERRCUDA(cerr);
  /* first numeric computation; later products reuse dBuffer4/dBuffer5 via spgemmDesc */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                                     Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                     cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                     mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
  ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024);CHKERRQ(ierr);
 }
 #else // ~PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  size_t bufSize2;
  /* ask bufferSize bytes for external memory */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
  /* ask bufferSize again bytes for external memory */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
  /* The CUSPARSE documentation is not clear, nor the API
     We need both buffers to perform the operations properly!
     mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
     it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
     is stored in the descriptor! What a messy API... */
  cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
  /* compute the intermediate product of A * B */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  /* get matrix C non-zero entries C_nnz1 */
  stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
  c->nz = (PetscInt) C_nnz1;
  ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr);
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
 #endif
#else
  /* legacy (CUDA < 11) two-step interface: csrgemmNnz to count, csrgemm to compute */
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
  stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
  c->nz = cnz;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
     I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
     D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalizesym:
  /* Build the host-side Mat_SeqAIJ structure of C: copy row offsets and column indices
     from the device, expand compressed rows, and fill the per-row length arrays.
     Values (c->a) are allocated but not copied; the numeric phase computes them. */
  c->singlemalloc = PETSC_FALSE;
  c->free_a       = PETSC_TRUE;
  c->free_ij      = PETSC_TRUE;
  ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
  if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
    PetscInt *d_i = c->i;
    THRUSTINTARRAY ii(Ccsr->row_offsets->size());
    THRUSTINTARRAY jj(Ccsr->column_indices->size());
    ii   = *Ccsr->row_offsets;
    jj   = *Ccsr->column_indices;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  } else {
    PetscInt *d_i = c->i;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  if (ciscompressed) { /* need to expand host row offsets */
    /* rows not listed in rindex are empty: they repeat the previous cumulative offset */
    PetscInt r = 0;
    c->i[0] = 0;
    for (k = 0; k < c->compressedrow.nrows; k++) {
      const PetscInt next = c->compressedrow.rindex[k];
      const PetscInt old = c->compressedrow.i[k];
      for (; r < next; r++) c->i[r+1] = old;
    }
    for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
  }
  ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
  c->maxnz = c->nz;
  c->nonzerorowcnt = 0;
  c->rmax = 0;
  for (k = 0; k < m; k++) {
    const PetscInt nn = c->i[k+1] - c->i[k];
    c->ilen[k] = c->imax[k] = nn;
    c->nonzerorowcnt += (PetscInt)!!nn;
    c->rmax = PetscMax(c->rmax,nn);
  }
  ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
  Ccsr->num_entries = c->nz;

  C->nonzerostate++;
  ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
  Ccusp->nonzerostate = C->nonzerostate;
  /* symbolic phase leaves values unset: device arrays exist but are not valid data yet */
  C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
  C->preallocated  = PETSC_TRUE;
  C->assembled     = PETSC_FALSE;
  C->was_assembled = PETSC_FALSE;
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
2835 
2836 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2837 
2838 /* handles sparse or dense B */
2839 static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
2840 {
2841   Mat_Product    *product = mat->product;
2842   PetscErrorCode ierr;
2843   PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;
2844 
2845   PetscFunctionBegin;
2846   MatCheckProduct(mat,1);
2847   ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr);
2848   if (!product->A->boundtocpu && !product->B->boundtocpu) {
2849     ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr);
2850   }
2851   if (product->type == MATPRODUCT_ABC) {
2852     Ciscusp = PETSC_FALSE;
2853     if (!product->C->boundtocpu) {
2854       ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr);
2855     }
2856   }
2857   if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
2858     PetscBool usecpu = PETSC_FALSE;
2859     switch (product->type) {
2860     case MATPRODUCT_AB:
2861       if (product->api_user) {
2862         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");CHKERRQ(ierr);
2863         ierr = PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
2864         ierr = PetscOptionsEnd();CHKERRQ(ierr);
2865       } else {
2866         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");CHKERRQ(ierr);
2867         ierr = PetscOptionsBool("-matproduct_ab_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
2868         ierr = PetscOptionsEnd();CHKERRQ(ierr);
2869       }
2870       break;
2871     case MATPRODUCT_AtB:
2872       if (product->api_user) {
2873         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr);
2874         ierr = PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
2875         ierr = PetscOptionsEnd();CHKERRQ(ierr);
2876       } else {
2877         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");CHKERRQ(ierr);
2878         ierr = PetscOptionsBool("-matproduct_atb_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
2879         ierr = PetscOptionsEnd();CHKERRQ(ierr);
2880       }
2881       break;
2882     case MATPRODUCT_PtAP:
2883       if (product->api_user) {
2884         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
2885         ierr = PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr);
2886         ierr = PetscOptionsEnd();CHKERRQ(ierr);
2887       } else {
2888         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");CHKERRQ(ierr);
2889         ierr = PetscOptionsBool("-matproduct_ptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr);
2890         ierr = PetscOptionsEnd();CHKERRQ(ierr);
2891       }
2892       break;
2893     case MATPRODUCT_RARt:
2894       if (product->api_user) {
2895         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");CHKERRQ(ierr);
2896         ierr = PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr);
2897         ierr = PetscOptionsEnd();CHKERRQ(ierr);
2898       } else {
2899         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");CHKERRQ(ierr);
2900         ierr = PetscOptionsBool("-matproduct_rart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr);
2901         ierr = PetscOptionsEnd();CHKERRQ(ierr);
2902       }
2903       break;
2904     case MATPRODUCT_ABC:
2905       if (product->api_user) {
2906         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");CHKERRQ(ierr);
2907         ierr = PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
2908         ierr = PetscOptionsEnd();CHKERRQ(ierr);
2909       } else {
2910         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");CHKERRQ(ierr);
2911         ierr = PetscOptionsBool("-matproduct_abc_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
2912         ierr = PetscOptionsEnd();CHKERRQ(ierr);
2913       }
2914       break;
2915     default:
2916       break;
2917     }
2918     if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
2919   }
2920   /* dispatch */
2921   if (isdense) {
2922     switch (product->type) {
2923     case MATPRODUCT_AB:
2924     case MATPRODUCT_AtB:
2925     case MATPRODUCT_ABt:
2926     case MATPRODUCT_PtAP:
2927     case MATPRODUCT_RARt:
2928      if (product->A->boundtocpu) {
2929         ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
2930       } else {
2931         mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
2932       }
2933       break;
2934     case MATPRODUCT_ABC:
2935       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2936       break;
2937     default:
2938       break;
2939     }
2940   } else if (Biscusp && Ciscusp) {
2941     switch (product->type) {
2942     case MATPRODUCT_AB:
2943     case MATPRODUCT_AtB:
2944     case MATPRODUCT_ABt:
2945       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2946       break;
2947     case MATPRODUCT_PtAP:
2948     case MATPRODUCT_RARt:
2949     case MATPRODUCT_ABC:
2950       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2951       break;
2952     default:
2953       break;
2954     }
2955   } else { /* fallback for AIJ */
2956     ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
2957   }
2958   PetscFunctionReturn(0);
2959 }
2960 
/* yy = A*xx: thin wrapper over the shared SpMV kernel with no transpose
   (trans=PETSC_FALSE), no conjugation (herm=PETSC_FALSE) and nothing added
   (the add-vector slot is passed as NULL) */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2969 
/* zz = A*xx + yy: wrapper over the shared SpMV kernel with no transpose and
   no conjugation */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2978 
/* yy = A^H*xx: wrapper over the shared SpMV kernel with trans=PETSC_TRUE and
   herm=PETSC_TRUE (conjugate transpose), nothing added */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2987 
/* zz = A^H*xx + yy: wrapper over the shared SpMV kernel with trans=PETSC_TRUE
   and herm=PETSC_TRUE (conjugate transpose) */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2996 
/* yy = A^T*xx: wrapper over the shared SpMV kernel with trans=PETSC_TRUE and
   herm=PETSC_FALSE (plain transpose), nothing added */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3005 
/* y[idx[i]] += x[i] for 0 <= i < n.
   Used below to scatter-add the compressed-row work vector back into the
   full-length result vector. Launched with a 1-D grid; the grid-stride loop
   makes the kernel correct for any launch configuration, and computing the
   index in PetscInt (with the block index widened before the multiply)
   avoids 32-bit overflow when n exceeds INT_MAX in 64-bit-index builds.
   No atomics are used, so entries of idx must be distinct — true for
   cprowIndices (one entry per nonzero row), but verify for any new caller. */
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  const PetscInt stride = (PetscInt)gridDim.x*blockDim.x;
  for (PetscInt i = (PetscInt)blockIdx.x*blockDim.x + threadIdx.x; i < n; i += stride) y[idx[i]] += x[i];
}
3011 
/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op.
   This is the single implementation behind all the MatMult/MatMultAdd/MatMultTranspose(Add)
   variants above. When yy is NULL the result is just op(A) x; yy == zz performs the add
   in place. Handles both full CSR storage and compressed-row storage (zero rows dropped),
   using cusparsestruct->workVector to hold the short product/input in the compressed case. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny;
#endif

  PetscFunctionBegin;
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
  if (!a->nonzerorowcnt) { /* no nonzero rows: op(A) x is identically zero, so z = y (or 0 when there is no y) */
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !A->form_explicit_transpose) {
      /* apply op on the untransposed data: cuSPARSE does the (conjugate) transpose on the fly */
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      /* use (building on first use) an explicitly stored transpose of A */
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
     #endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
     #endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        cudaError_t cerr;
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                matstruct->matDescr,
                                matstruct->cuSpMV[opA].vecXDescr, beta,
                                matstruct->cuSpMV[opA].vecYDescr,
                                cusparse_scalartype,
                                cusparsestruct->spmvAlg,
                                &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
      }

      stat = cusparseSpMV(cusparsestruct->handle, opA,
                               matstruct->alpha_one,
                               matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
                               matstruct->cuSpMV[opA].vecXDescr,
                               beta,
                               matstruct->cuSpMV[opA].vecYDescr,
                               cusparse_scalartype,
                               cusparsestruct->spmvAlg,
                               matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
     #else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha_one, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
     #endif
    } else {
      if (cusparsestruct->nrows) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha_one, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
       #endif
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
         */
       #if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
       #else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
       #endif
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    /* without the add, the diagonal-free rows save one flop each */
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3203 
/* zz = A^T*xx + yy: wrapper over the shared SpMV kernel with trans=PETSC_TRUE
   and herm=PETSC_FALSE (plain transpose) */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3212 
/* Completes assembly of a SEQAIJCUSPARSE matrix. After the host-side assembly,
   the cached flat device matrix (deviceMat) is freed if the assembly changed
   the nonzero structure, since it would no longer match the host data. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscObjectState   oldnnzstate     = A->nonzerostate;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); /* may bump A->nonzerostate */
  if (cusparsestruct->deviceMat && A->nonzerostate != oldnnzstate) {
    cudaError_t cerr;

    ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr);
    cerr = cudaFree(cusparsestruct->deviceMat);CHKERRCUDA(cerr);
    cusparsestruct->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}
3230 
3231 /* --------------------------------------------------------------------------------*/
3232 /*@
3233    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
3236    assembly performance the user should preallocate the matrix storage by setting
3237    the parameter nz (or the array nnz).  By setting these parameters accurately,
3238    performance during matrix assembly can be increased by more than a factor of 50.
3239 
3240    Collective
3241 
3242    Input Parameters:
3243 +  comm - MPI communicator, set to PETSC_COMM_SELF
3244 .  m - number of rows
3245 .  n - number of columns
3246 .  nz - number of nonzeros per row (same for all rows)
3247 -  nnz - array containing the number of nonzeros in the various rows
3248          (possibly different for each row) or NULL
3249 
3250    Output Parameter:
3251 .  A - the matrix
3252 
3253    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
3255    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
3256 
3257    Notes:
3258    If nnz is given then nz is ignored
3259 
3260    The AIJ format (also called the Yale sparse matrix format or
3261    compressed row storage), is fully compatible with standard Fortran 77
3262    storage.  That is, the stored row and column indices can begin at
3263    either one (as in Fortran) or zero.  See the users' manual for details.
3264 
3265    Specify the preallocated storage with either nz or nnz (not both).
3266    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
3267    allocation.  For large problems you MUST preallocate memory or you
3268    will get TERRIBLE performance, see the users' manual chapter on matrices.
3269 
3270    By default, this format uses inodes (identical nodes) when possible, to
3271    improve numerical efficiency of matrix-vector products and solves. We
3272    search for consecutive rows with the same nonzero structure, thereby
3273    reusing matrix information to achieve increased efficiency.
3274 
3275    Level: intermediate
3276 
3277 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
3278 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* standard creation sequence: create -> set sizes -> set type -> preallocate */
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); /* sequential matrix: local and global sizes coincide */
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3290 
/* Destroys a SEQAIJCUSPARSE matrix: frees the GPU-side data (either the plain
   matrix structures or the triangular-factor structures, depending on whether
   A is a factored matrix), detaches all composed methods installed for this
   type, then delegates the host-side cleanup to MatDestroy_SeqAIJ(). */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  /* remove the composed function pointers registered for this type */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaijcusparse_hypre_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3313 
3314 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
3315 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicates A by performing the host-side SeqAIJ duplication first and then
   converting the copy in place to SEQAIJCUSPARSE */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3325 
/* Y = a*X + Y on the GPU.
   Dispatch strategy:
   - if X or Y is not handled by this implementation, fall back to the CPU MatAXPY_SeqAIJ();
   - if the two CSR patterns turn out to be identical, use a single cuBLAS axpy on the value arrays;
   - if X's pattern is a subset of Y's, use cuSPARSE csr spgeam (Y keeps its pattern);
   - otherwise fall back to the CPU implementation (which may change Y's pattern). */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry,*csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) { /* mixed types (e.g., one bound to CPU): do the work on the host */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
  if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;
  /* see if we can turn this into a cublas axpy */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    /* same nnz count: compare row offsets and column indices on the device to detect an identical pattern */
    bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
    if (eq) {
      eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
    }
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    cusparseStatus_t stat;
    PetscScalar      b = 1.0; /* coefficient of Y in the geam: Y = a*X + 1.0*Y */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t           bufferSize;
    void             *buffer;
    cudaError_t      cerr;
#endif

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    /* a and b live on the host stack, so switch the pointer mode for the geam calls */
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
                                          &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                          &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                             cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    cerr = cudaFree(buffer);CHKERRCUDA(cerr);
#else
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
#endif
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else if (str == SAME_NONZERO_PATTERN) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t berr;
    PetscBLASInt   one = 1, bnz = 1;

    /* identical patterns: the value arrays line up, so a plain axpy on them suffices */
    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr);
    ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else {
    /* different pattern: let the CPU implementation rebuild Y */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3423 
/* Y = a*Y: scales all stored nonzeros of Y on the GPU with one cuBLAS scal
   call on the CSR values array, then invalidates the cached diagonal. */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)Y->data;
  cublasHandle_t handle;
  cublasStatus_t cberr;
  PetscScalar    *aval;
  PetscBLASInt   inc = 1, bnz = 1;

  PetscFunctionBegin;
  ierr = PetscCUBLASGetHandle(&handle);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(aij->nz,&bnz);CHKERRQ(ierr); /* cuBLAS takes BLAS-int counts */
  ierr = MatSeqAIJCUSPARSEGetArray(Y,&aval);CHKERRQ(ierr); /* device pointer to the values */
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  cberr = cublasXscal(handle,bnz,&a,aval,inc);CHKERRCUBLAS(cberr);
  ierr = PetscLogGpuFlops(bnz);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSERestoreArray(Y,&aval);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); /* cached diagonal is stale after scaling */
  PetscFunctionReturn(0);
}
3445 
/* Zeroes all stored entries of A, on the GPU (the CSR values of the matrix
   and of its cached explicit transpose, when present) as well as on the CPU,
   and sets the offload mask to reflect where valid (zeroed) data now lives. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscBool      zeroedongpu = PETSC_FALSE;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) { /* factored matrices have no plain GPU CSR data to wipe */
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

    if (cusp->mat) {
      CsrMatrix *csr = (CsrMatrix*)cusp->mat->mat;
      if (csr->values) {
        thrust::fill(thrust::device,csr->values->begin(),csr->values->end(),0.);
        zeroedongpu = PETSC_TRUE;
      }
    }
    if (cusp->matTranspose) { /* keep the cached transpose consistent */
      CsrMatrix *csr = (CsrMatrix*)cusp->matTranspose->mat;
      if (csr->values) {
        thrust::fill(thrust::device,csr->values->begin(),csr->values->end(),0.);
      }
    }
  }
  /* zero the host-side values directly instead of calling MatZeroEntries_SeqAIJ() */
  ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  A->offloadmask = zeroedongpu ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
3476 
/* Switch the operations table of A between the CPU (SeqAIJ) and GPU (CUSPARSE)
   implementations.  flg == PETSC_TRUE binds A to the CPU: data is first copied
   back from the GPU and the GPU-specific composed functions are removed;
   flg == PETSC_FALSE installs the CUSPARSE implementations. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* factored matrices do not get rebound */
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  if (flg) {
    /* make sure the host copy of the values is current before going CPU-only */
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* reset the SeqAIJ-level ops table to its defaults */
    ierr = PetscMemzero(a->ops,sizeof(Mat_SeqAIJOps));CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  /* inodes are only used by the CPU kernels */
  if (flg && a->inode.size) {
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3537 
/* Convert a MATSEQAIJ matrix to MATSEQAIJCUSPARSE.  Allocates the CUSPARSE
   side-struct (Mat_SeqAIJCUSPARSE for regular matrices, TriFactors for factored
   ones), sets the default vector type to VECCUDA and installs the GPU operation
   table via MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE). */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  B = *newmat; /* for MAT_INPLACE_MATRIX, *newmat is presumably A itself */

  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      /* regular (non-factored) matrix: allocate the CUSPARSE mult struct */
      Mat_SeqAIJCUSPARSE *spptr;
      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      spptr->format     = MAT_CUSPARSE_CSR;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
     #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
      spptr->spmvAlg    = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
     #else
      spptr->spmvAlg    = CUSPARSE_CSRMV_ALG1;    /* default, since we only support csr */
     #endif
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
     #endif
      B->spptr = spptr;
    } else {
      /* factored matrix: allocate the triangular-factors struct instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr); /* install the GPU operation table */
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
#if defined(PETSC_HAVE_HYPRE)
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}
3598 
/* MATSEQAIJCUSPARSE constructor: build a plain SeqAIJ matrix, then convert it
   in place to the CUSPARSE type. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3608 
3609 /*MC
3610    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3611 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
3613    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
3614    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3615 
3616    Options Database Keys:
3617 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
3618 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3619 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3620 
3621   Level: beginner
3622 
3623 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3624 M*/
3625 
3626 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);
3627 
/* Register the CUSPARSE-based factorization solvers with PETSc's solver registry. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode      ierr;
  const MatFactorType ftypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};

  PetscFunctionBegin;
  /* the band LU solver takes plain SeqAIJ input matrices */
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr);
  /* the generic CUSPARSE solver supports LU, Cholesky, ILU and ICC */
  for (size_t k = 0; k < sizeof(ftypes)/sizeof(ftypes[0]); k++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftypes[k],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3641 
/* Free a Mat_SeqAIJCUSPARSE struct: the GPU matrix and its cached transpose,
   the work/COO support arrays, and the cusparse handle; finally the struct
   itself and the caller's pointer are cleared by PetscFree. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    /* delete on a NULL pointer is a no-op, so these need no guards */
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    delete (*cusparsestruct)->csr2csc_i;
    if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3661 
/* Release the device storage of a CsrMatrix (values, column indices, row
   offsets), delete the object and clear the caller's pointer. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (!*mat) PetscFunctionReturn(0);
  CsrMatrix *csr = *mat;
  *mat = 0;
  delete csr->values;
  delete csr->column_indices;
  delete csr->row_offsets;
  delete csr;
  PetscFunctionReturn(0);
}
3674 
/* Free a triangular-factor struct: the cusparse matrix descriptor, the
   analysis info, the CSR storage, the solve buffer, the host staging array
   and (CUDA >= 11) the csr2csc conversion buffer. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    if ((*trifactor)->solveBuffer)   {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
    if ((*trifactor)->AA_h)   {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);} /* AA_h is host pinned memory, hence cudaFreeHost */
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
   #endif
    ierr = PetscFree(*trifactor);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3694 
/* Free a Mat_SeqAIJCUSPARSEMultStruct: the stored matrix (CSR, or HYB/ELL on
   pre-11 CUDA), the cusparse matrix descriptor, the compressed-row index array,
   the device-side scalar constants and (CUDA >= 11) the SpMV descriptors and
   buffers.  The caller's pointer is reset to NULL. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
       #endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        /* bug fix: the return code of CsrMatrix_Destroy() was previously discarded,
           unlike every other error-checked call in this file */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    /* alpha_one/beta_zero/beta_one are device-resident scalar constants */
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
3738 
/* Release everything a Mat_SeqAIJCUSPARSETriFactors holds (the four factors,
   permutation index arrays, work vector and band-factor device arrays) while
   keeping the struct itself and its cusparse handle alive so it can be refilled
   by a subsequent factorization. */
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = NULL;
    (*trifactors)->cpermIndices = NULL;
    (*trifactors)->workVector = NULL;
    if ((*trifactors)->a_band_d)   {cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);}
    if ((*trifactors)->i_band_d)   {cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);}
    /* bug fix: NULL the freed band pointers (like the fields above) so a second
       Reset -- e.g. via TriFactors_Destroy after an explicit Reset -- cannot
       cudaFree them twice */
    (*trifactors)->a_band_d = NULL;
    (*trifactors)->i_band_d = NULL;
    (*trifactors)->init_dev_prop = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3761 
/* Fully destroy a Mat_SeqAIJCUSPARSETriFactors: reset its contents, destroy the
   cusparse handle and free the struct itself. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    /* fix: hoist the assignment out of the if-condition; `if (handle = ...)`
       is an assignment-as-condition that compilers rightly warn about */
    handle = (*trifactors)->handle;
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3778 
/* Strict weak ordering on (row,col) tuples: lexicographic, row-major.
   Used to sort COO entries into CSR order. */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    /* order by row first; fall back to the column only when rows tie */
    if (t1.get<0>() != t2.get<0>()) return t1.get<0>() < t2.get<0>();
    return t1.get<1>() < t2.get<1>();
  }
};
3789 
/* Equality predicate on (row,col) tuples: both components must match.
   Used to collapse duplicate COO entries. */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>();
  }
};
3799 
/* Binary op returning 1 when the two values differ, 0 when equal; used with
   adjacent_difference to mark where a new (i,j) pair starts. */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != t2) ? 1 : 0;
  }
};
3808 
/* Logical-or of two flags as a PetscInt (0 or 1); combines the row-change and
   column-change markers produced by IJDiff. */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != 0 || t2 != 0) ? 1 : 0;
  }
};
3817 
3818 #include <thrust/iterator/discard_iterator.h>
/* Insert or add the values v[] -- given in the COO ordering established by
   MatSetPreallocationCOO_SeqAIJCUSPARSE() -- into the GPU CSR storage of A.
   v may point to host or device memory; imode is INSERT_VALUES or ADD_VALUES.
   A NULL v with INSERT_VALUES zeroes the matrix. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY                           *cooPerm_v = NULL; /* device staging buffer when v is host memory */
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                             *matrix;
  PetscErrorCode                        ierr;
  PetscInt                              n;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    /* no COO preallocation information: fall back to a plain assembly */
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) {
    /* NULL v: zero the values on INSERT, no-op on ADD */
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else {
    /* stage the host values into a temporary device array */
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v,v+n);
    d_v = cooPerm_v->data();
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
  }
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) { /* there are repeated entries in d_v[]; combine them first, then accumulate */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
        cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonzeros in d_v[].
        cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
      */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* all nonzeros in d_v[] are unique entries */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]]  */
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else {
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals()); /* values[i] = d_v[cooPerm[i]] */
    }
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU; /* the up-to-date values now live on the GPU */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr);
  ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr);
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
3900 
/* Mark the cached GPU transpose of A as out of date; when destroy is true the
   transpose storage and csr2csc permutation are freed outright. */
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusparsestruct) {
    if (destroy) {
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
      delete cusparsestruct->csr2csc_i;
      cusparsestruct->csr2csc_i = NULL;
    }
    A->transupdated = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3917 
3918 #include <thrust/binary_search.h>
/* Preallocate A from n COO (row,col) pairs coo_i[]/coo_j[]: sorts the pairs on
   the GPU, collapses duplicates, builds the host CSR structure of A, and caches
   cooPerm (COO -> CSR position map) plus, when duplicates exist, cooPerm_a
   (COO entry -> unique-nonzero index) for later use by MatSetValuesCOO(). */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0; /* nzr counts rows with at least one nonzero */
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    /* size changed: discard previously cached permutation arrays */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);

    /* Ex.
      n = 6
      coo_i = [3,3,1,4,1,4]
      coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    *cusp->cooPerm_a = d_i; /* copy the sorted array */
    THRUSTINTARRAY w = d_j;

    /*
      d_i     = [1,1,3,3,4,4]
      d_j     = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i     = [1,3,3,4,4,x]
                            ^ekey
      d_j     = [2,2,3,5,6,x]
                           ^nekey
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());                                              /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
      (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a =          [0,0,1,1,1,1]*/
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin()); /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* rebuild the host CSR arrays of A to match the unique COO pattern */
    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i]; /* nonzeros in row i */
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
  } else {
    /* empty COO list: set up an empty preallocation */
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);

  A->assembled = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
4036 
4037 /*@C
4038     MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices.
4039 
4040    Not collective
4041 
4042     Input Parameters:
4043 +   A - the matrix
4044 -   compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form
4045 
    Output Parameters:
+   i - the CSR row pointers
-   j - the CSR column indices
4049 
4050     Level: developer
4051 
4052     Notes:
4053       When compressed is true, the CSR structure does not contain empty rows
4054 
4055 .seealso: MatSeqAIJCUSPARSERestoreIJ(), MatSeqAIJCUSPARSEGetArrayRead()
4056 @*/
/* Return device pointers to the CSR row offsets (*i) and column indices (*j)
   of A.  When compressed is false but A uses compressed-row storage, a full
   (uncompressed) row-offset array is built on the device and cached. */
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  /* NOTE(review): both pointers must be non-NULL or nothing is returned --
     confirm this all-or-nothing contract is intended */
  if (!i || !j) PetscFunctionReturn(0);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); /* may trigger a host-to-device copy */
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      if (!cusp->rowoffsets_gpu) {
        /* build and cache the uncompressed row offsets on the device */
        cusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
        ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(0);
}
4085 
4086 /*@C
4087     MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ()
4088 
4089    Not collective
4090 
4091     Input Parameters:
4092 +   A - the matrix
4093 -   compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form
4094 
    Output Parameters:
+   i - the CSR row pointers
-   j - the CSR column indices
4098 
4099     Level: developer
4100 
4101 .seealso: MatSeqAIJCUSPARSEGetIJ()
4102 @*/
/* Companion to MatSeqAIJCUSPARSEGetIJ(): invalidates the caller's pointers.
   No data movement or state change is performed here. */
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (i) *i = NULL;
  if (j) *j = NULL;
  PetscFunctionReturn(0);
}
4112 
4113 /*@C
4114    MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4115 
4116    Not Collective
4117 
4118    Input Parameter:
4119 .   A - a MATSEQAIJCUSPARSE matrix
4120 
4121    Output Parameter:
4122 .   a - pointer to the device data
4123 
4124    Level: developer
4125 
4126    Notes: may trigger host-device copies if up-to-date matrix data is on host
4127 
4128 .seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArrayRead()
4129 @*/
/* Return a read-only raw device pointer to the CSR values of A; may trigger a
   host-to-device copy when the freshest data is on the CPU. */
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); /* ensure the device copy is current */
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get(); /* raw device pointer into the CSR values array */
  PetscFunctionReturn(0);
}
4148 
/*@C
   MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead()

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data (set to NULL on return)

   Level: developer

.seealso: MatSeqAIJCUSPARSEGetArrayRead()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* read-only access: no state change needed, just invalidate the caller's pointer */
  *a = NULL;
  PetscFunctionReturn(0);
}
4173 
/*@C
   MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes: may trigger host-device copies if up-to-date matrix data is on host

.seealso: MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArray()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *matrixcsr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* only the CSR storage format exposes a raw values array */
  if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* bring the device copy up to date before exposing it for reading and writing */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusparsestruct->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrixcsr = (CsrMatrix*)cusparsestruct->mat->mat;
  if (!matrixcsr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = matrixcsr->values->data().get();
  /* values may be modified through the returned pointer: the GPU copy becomes
     authoritative and any cached transpose is now stale */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/*@C
   MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray()

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data (set to NULL on return)

   Level: developer

.seealso: MatSeqAIJCUSPARSEGetArray()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* values may have been changed through the read-write pointer; bump the
     object state so dependent objects know the matrix was modified */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL;
  PetscFunctionReturn(0);
}
4238 
/*@C
   MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes: does not trigger host-device copies and flags data validity on the GPU

.seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSERestoreArrayWrite()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *matrixcsr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* only the CSR storage format exposes a raw values array */
  if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* write-only access: unlike MatSeqAIJCUSPARSEGetArray(), no host-to-device copy is performed,
     so the device structures must already exist */
  if (!cusparsestruct->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrixcsr = (CsrMatrix*)cusparsestruct->mat->mat;
  if (!matrixcsr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = matrixcsr->values->data().get();
  /* the device copy is about to be overwritten: mark it authoritative and drop any cached transpose */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4275 
/*@C
   MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite()

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data (set to NULL on return)

   Level: developer

.seealso: MatSeqAIJCUSPARSEGetArrayWrite()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* values were (presumably) written through the pointer; bump the object
     state so dependent objects know the matrix was modified */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL;
  PetscFunctionReturn(0);
}
4303 
/* Strict-weak ordering on (row, col, value, perm) tuples: lexicographic by
   (row, col) only; the value and permutation slots are carried along untouched.
   Used as the comparator when merging the COO forms of two matrices. */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    const int r1 = t1.get<0>();
    const int r2 = t2.get<0>();
    if (r1 != r2) return r1 < r2;           /* distinct rows: order by row index */
    return t1.get<1>() < t2.get<1>();       /* same row: order by column index */
  }
};
4314 
/* Unary functor adding a fixed integer offset to an index; used below to
   relocate column indices (and transposed row offsets) when concatenating matrices. */
struct Shift
{
  int _shift;   /* offset applied to every input */

  Shift(int shift) : _shift(shift) {}
  __host__ __device__
  inline int operator() (const int &c)
  {
    return _shift + c;
  }
};
4326 
/* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in matlab notation

   A and B must have the same number of rows; the result C has A->cmap->n + B->cmap->n columns,
   i.e. C = [A, B].  With MAT_INITIAL_MATRIX the full CSR structure of C is built on the device
   (and mirrored to the host); the permutation mapping the entries of A and B into C is cached in
   Ccusp->cooPerm so that a MAT_REUSE_MATRIX call only needs to scatter the numerical values.
   MAT_INPLACE_MATRIX is not supported. */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     Annz,Bnnz;
  cusparseStatus_t             stat;
  PetscInt                     i,m,n,zero = 0;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidHeaderSpecific(B,MAT_CLASSID,2);
  PetscValidPointer(C,4);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
  if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n);
  if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
  if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    /* build C from scratch: create the Mat, its cuSPARSE mult structure and an (initially empty) CSR container */
    m     = A->rmap->n;
    n     = A->cmap->n + B->cmap->n;
    ierr  = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
    ierr  = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
    ierr  = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    c     = (Mat_SeqAIJ*)(*C)->data;
    Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
    Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr  = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows    = m;
    Ccusp->mat      = Cmat;
    Ccusp->mat->mat = Ccsr;
    Ccsr->num_rows  = m;
    Ccsr->num_cols  = n;
    stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
    /* device-resident scalar constants used by the SpMV routines */
    cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
    if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
    if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");

    /* allocate C's CSR arrays; cooPerm will record where each entry of A and B lands in C */
    Acsr = (CsrMatrix*)Acusp->mat->mat;
    Bcsr = (CsrMatrix*)Bcusp->mat->mat;
    Annz = (PetscInt)Acsr->column_indices->size();
    Bnnz = (PetscInt)Bcsr->column_indices->size();
    c->nz = Annz + Bnnz;
    Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    Ccsr->num_entries = c->nz;
    Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      auto Acoo = new THRUSTINTARRAY32(Annz);
      auto Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff,*Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
          ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
          ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      /* expand CSR row offsets to per-entry COO row indices so entries of A and B can be merged row by row */
      stat = cusparseXcsr2coo(Acusp->handle,
                              Aroff->data().get(),
                              Annz,
                              m,
                              Acoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle,
                              Broff->data().get(),
                              Bnnz,
                              m,
                              Bcoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      /* tag each entry with its origin (1 = from A, 0 = from B) so the merged tags can later be split back into the two permutation halves */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
      /* B's column indices are shifted by A->cmap->n since B's columns come after A's in C */
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
#endif
      /* merge the (row, col, value, origin-tag) streams of A and B into C, ordered by (row, col) */
      auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
      auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
      auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
      auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
      auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
      auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
      auto p1 = Ccusp->cooPerm->begin();
      auto p2 = Ccusp->cooPerm->begin();
      thrust::advance(p2,Annz);
      PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
      /* undo the in-place shift applied to B's column indices above */
      thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
#endif
      /* split the merged origin tags into cooPerm: positions of A's entries in C first, then B's */
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      auto pred = thrust::identity<int>();
      PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
      PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
#endif
      /* compress the merged COO row indices back into C's CSR row offsets */
      stat = cusparseXcoo2csr(Ccusp->handle,
                              Ccoo->data().get(),
                              c->nz,
                              m,
                              Ccsr->row_offsets->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
                               Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        /* C' = [A' ; B'] (vertical stacking): A' contributes the first A->cmap->n rows of C',
           B' the remaining ones, so its arrays can simply be appended to A's */
        ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
        ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr);
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix *CcsrT = new CsrMatrix;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated = PETSC_TRUE;
        Ccusp->rowoffsets_gpu = NULL;
        CmatT->cprowIndices = NULL;
        CmatT->mat = CcsrT;
        CcsrT->num_rows = n;
        CcsrT->num_cols = m;
        CcsrT->num_entries = c->nz;

        CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values = new THRUSTARRAY(c->nz);

        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
          /* the last offset of A' equals the first of B' (shifted); step back so it is written once */
          thrust::advance(rT,-1);
        }
        if (BT) {
          /* B' row offsets are shifted by a->nz since B's values follow A's in the concatenated arrays */
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
          thrust::copy(titb,tite,rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

        stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
                                 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
                                 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* mirror C's i/j structure on the host so the Mat_SeqAIJ side of C is usable */
    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii   = *Ccsr->row_offsets;
      jj   = *Ccsr->column_indices;
      cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    } else {
      cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
    c->maxnz = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax = 0;
    /* derive per-row lengths and row statistics from the host row offsets */
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i+1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax,nn);
    }
    ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
    (*C)->nonzerostate++;
    ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
    ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* MAT_REUSE_MATRIX: structure is unchanged, only scatter the values of A and B
       into C using the permutation cached during the MAT_INITIAL_MATRIX call */
    if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",(*C)->rmap->n,B->rmap->n);
    c = (Mat_SeqAIJ*)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
      if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
      if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
      if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
      if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix*)Acusp->mat->mat;
      Bcsr = (CsrMatrix*)Bcusp->mat->mat;
      Ccsr = (CsrMatrix*)Ccusp->mat->mat;
      if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
      if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
      if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
      if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
      if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
      /* cooPerm holds A's target positions in [0,Annz) and B's in [Annz,nz); scatter each range via a permutation iterator */
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid,Acsr->num_entries);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      thrust::for_each(zibait,zieait,VecCUDAEquals());
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
      thrust::for_each(zibbit,ziebit,VecCUDAEquals());
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr);
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
        /* transpose values are stored as [A' values, B' values]: refresh them by straight concatenation */
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    }
  }
  ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
4630 
/* MatSeqAIJCopySubArray_SeqAIJCUSPARSE - copy selected entries of the device values array of A into v

   Input Parameters:
+  A   - a MATSEQAIJCUSPARSE matrix
.  n   - number of entries to copy
-  idx - indices (into the aij value array) of the entries to gather, or NULL to copy the first n entries contiguously

   Output Parameter:
.  v - destination array; may point to either host or device memory (detected via isCudaMem())
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  PetscErrorCode    ierr;
  bool              dmem;
  const PetscScalar *av;
  cudaError_t       cerr;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
  if (n && idx) {
    /* gather path: upload the indices, then scatter-gather on the device */
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

    THRUSTARRAY *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      /* host destination: gather into a device scratch buffer first */
      w = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (w) {
      cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    delete w;
  } else {
    /* contiguous path: plain memcpy of the leading n entries */
    cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  /* when v is host memory the data moved device->host above, so log GpuToCpu (was incorrectly PetscLogCpuToGpu) */
  if (!dmem) { ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr); }
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4670