xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 4e278199b78715991f5c71ebbd945c1489263e6c)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
7 
8 #include <petscconf.h>
9 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
10 #include <../src/mat/impls/sbaij/seq/sbaij.h>
11 #include <../src/vec/vec/impls/dvecimpl.h>
12 #include <petsc/private/vecimpl.h>
13 #undef VecType
14 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
15 #include <thrust/async/for_each.h>
16 
17 const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
18 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
19   /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
20     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
21 
22   typedef enum {
23       CUSPARSE_MV_ALG_DEFAULT = 0,
24       CUSPARSE_COOMV_ALG      = 1,
25       CUSPARSE_CSRMV_ALG1     = 2,
26       CUSPARSE_CSRMV_ALG2     = 3
27   } cusparseSpMVAlg_t;
28 
29   typedef enum {
30       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
31       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
32       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
33       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
34       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
35       CUSPARSE_SPMM_ALG_DEFAULT = 0,
36       CUSPARSE_SPMM_COO_ALG1    = 1,
37       CUSPARSE_SPMM_COO_ALG2    = 2,
38       CUSPARSE_SPMM_COO_ALG3    = 3,
39       CUSPARSE_SPMM_COO_ALG4    = 5,
40       CUSPARSE_SPMM_CSR_ALG1    = 4,
41       CUSPARSE_SPMM_CSR_ALG2    = 6,
42   } cusparseSpMMAlg_t;
43 
44   typedef enum {
45       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
46       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
47   } cusparseCsr2CscAlg_t;
48   */
49   const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
50   const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
51   const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
52 #endif
53 
54 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
55 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
56 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
57 
58 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
59 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
60 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
61 
62 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
63 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
64 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
65 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
66 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
67 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
68 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
69 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
70 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
71 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
72 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
73 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
74 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
75 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
76 
77 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
78 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
79 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
80 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
81 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
82 
83 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
84 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
85 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);
86 
87 PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
88 PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
89 
90 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
91 
/*
   MatCUSPARSESetStream - records a CUDA stream in the matrix's GPU data and
   installs it on the matrix's cuSPARSE handle, so subsequent cuSPARSE calls
   made through that handle are issued on the given stream.

   Errors with PETSC_ERR_COR if the matrix has no GPU data (spptr) yet.
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cstat;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusp->stream = stream;
  cstat = cusparseSetStream(cusp->handle,cusp->stream);CHKERRCUSPARSE(cstat);
  PetscFunctionReturn(0);
}
103 
/*
   MatCUSPARSESetHandle - replaces the cuSPARSE handle stored in the matrix's GPU data.

   If a different handle is already stored it is destroyed first; the handle's
   pointer mode is then (re)set to CUSPARSE_POINTER_MODE_DEVICE.

   Errors with PETSC_ERR_COR if the matrix has no GPU data (spptr) yet.
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cstat;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusp->handle != handle) {
    /* a different handle is being installed: release the previously stored one first */
    if (cusp->handle) {cstat = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(cstat);}
    cusp->handle = handle;
  }
  cstat = cusparseSetPointerMode(cusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(cstat);
  PetscFunctionReturn(0);
}
120 
/*
   MatCUSPARSEClearHandle - forgets the cuSPARSE handle stored in the matrix's GPU data
   without destroying it (contrast with MatCUSPARSESetHandle(), which destroys a handle
   it replaces). A no-op for matrices that are not MATSEQAIJCUSPARSE or have no GPU data.
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          isseqaijcusp;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&isseqaijcusp);CHKERRQ(ierr);
  if (!isseqaijcusp || !cusp) PetscFunctionReturn(0);
  /* drop the stored reference only; the handle itself is not destroyed here */
  if (cusp->handle) cusp->handle = 0;
  PetscFunctionReturn(0);
}
133 
/* Composed on factor matrices via "MatFactorGetSolverType_C": reports that this
   factor was produced by the MATSOLVERCUSPARSE solver package. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
140 
141 /*MC
142   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
143   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
145   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
146   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
147   algorithms are not recommended. This class does NOT support direct solver operations.
148 
149   Level: beginner
150 
151 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
152 M*/
153 
/*
   MatGetFactor_seqaijcusparse_cusparse - creates an empty factor matrix of type
   MATSEQAIJCUSPARSE for the requested factorization kind, wires up the symbolic
   factorization function pointers, and records the preferred orderings.

   Supported: MAT_FACTOR_LU / MAT_FACTOR_ILU / MAT_FACTOR_ILUDT and
   MAT_FACTOR_CHOLESKY / MAT_FACTOR_ICC; anything else raises PETSC_ERR_SUP.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  /* the factor is square, sized by the local row count of A */
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU:
  case MAT_FACTOR_ILUDT:
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);CHKERRQ(ierr);
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC:
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]);CHKERRQ(ierr);
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  }

  /* no CPU-side array allocation; values live on the GPU side of the factor */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  (*B)->canuseordering = PETSC_TRUE;
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
184 
/*
   MatCUSPARSESetFormat_SeqAIJCUSPARSE - type-specific implementation behind
   MatCUSPARSESetFormat(); stores the requested GPU storage format.

   For SEQAIJCUSPARSE both supported operations (MAT_CUSPARSE_MULT and
   MAT_CUSPARSE_ALL) set the same single format field; other operations error.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  /* guard against a matrix whose GPU data has not been created yet, consistent
     with MatCUSPARSESetStream()/MatCUSPARSESetHandle(); previously this would
     dereference a NULL pointer */
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  switch (op) {
  case MAT_CUSPARSE_MULT:  /* fall through: both ops share the one format field */
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
202 
203 /*@
204    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
205    operation. Only the MatMult operation can use different GPU storage formats
206    for MPIAIJCUSPARSE matrices.
207    Not Collective
208 
209    Input Parameters:
210 +  A - Matrix of type SEQAIJCUSPARSE
211 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
212 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
213 
214    Output Parameter:
215 
216    Level: intermediate
217 
218 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
219 @*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation (e.g. MatCUSPARSESetFormat_SeqAIJCUSPARSE)
     if one is composed on A; silently a no-op for other matrix types */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
229 
/*
   MatSetOption_SeqAIJCUSPARSE - handles MAT_FORM_EXPLICIT_TRANSPOSE locally
   (managing the cached GPU transpose) and delegates every other option to the
   plain SeqAIJ implementation.
*/
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);}
    A->form_explicit_transpose = flg;
  } else {
    ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
247 
248 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
249 
/*
   MatLUFactorNumeric_SeqAIJCUSPARSE - numeric LU factorization: pulls the latest
   values to the host, runs the CPU SeqAIJ numeric factorization, selects the
   solve kernels (natural-ordering variants when both permutations are the
   identity), and then builds/uploads the triangular factors on the GPU.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* the CPU factorization below reads host values, so sync them first */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  /* no mat-solve variants in either case */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
280 
/*
   MatSetFromOptions_SeqAIJCUSPARSE - processes the -mat_cusparse_* command line
   options for a (non-factored) SEQAIJCUSPARSE matrix: storage format for SpMV /
   for everything, and (CUDA >= 11) the cuSPARSE SpMV, SpMM and csr2csc algorithms.
   Factor matrices (A->factortype != MAT_FACTOR_NONE) take no options here.
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    /* format for matrix-vector products only */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    /* format for all operations; processed second so it overrides the option above if both are given */
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");

    /* same positional-consistency check pattern for the SpMM and csr2csc algorithm enums below */
    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
   #endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
316 
/* Symbolic ILU: discard any stale GPU triangular-factor data, run the CPU
   symbolic phase, then install the CUSPARSE numeric factorization routine. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *triFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&triFactors);CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
328 
/* Symbolic LU: discard any stale GPU triangular-factor data, run the CPU
   symbolic phase, then install the CUSPARSE numeric factorization routine. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *triFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&triFactors);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
340 
/* Symbolic ICC: discard any stale GPU triangular-factor data, run the CPU
   symbolic phase, then install the CUSPARSE numeric Cholesky routine. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *triFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&triFactors);CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
352 
/* Symbolic Cholesky: discard any stale GPU triangular-factor data, run the CPU
   symbolic phase, then install the CUSPARSE numeric Cholesky routine. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *triFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&triFactors);CHKERRQ(ierr);
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
364 
/*
   MatSeqAIJCUSPARSEBuildILULowerTriMatrix - builds (or value-updates) the GPU copy
   of the unit-lower-triangular factor L extracted from the host SeqAIJ factor in A.

   First call (no loTriFactorPtr yet): assembles a standalone CSR for L in pinned
   host memory — each row's strictly-lower entries followed by an explicit 1.0 on
   the diagonal — uploads it into thrust arrays, creates the cuSPARSE matrix
   descriptor, and runs the triangular-solve analysis. Subsequent calls refresh
   only the numerical values and re-upload them.

   NOTE(review): assumes the SeqAIJ data of A holds the combined L/U factor with
   a->i/a->j/a->a covering the strictly-lower part row by row (row 0 of L has only
   its diagonal) — confirm against MatLUFactorNumeric_SeqAIJ's factor layout.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* only (re)build when the up-to-date values live on the CPU */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* n unit-diagonal entries + strictly-lower entries of rows 1..n-1 (row 0 contributes none) */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {
        PetscScalar                       *AALo;

        /* pinned host buffers so the thrust assign() uploads are fast */
        cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the lower triangular matrix */
        cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the lower triangular matrix: row 0 is just the unit diagonal */
        AiLo[0]  = (PetscInt) 0;
        AiLo[n]  = nzLower;
        AjLo[0]  = (PetscInt) 0;
        AALo[0]  = (MatScalar) 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;       /* write position in AjLo/AALo */
        rowOffset= 1;       /* running CSR row pointer */
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          /* copy the strictly-lower entries of row i, then append the unit diagonal */
          ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix: device-side CSR mirrors of the host buffers */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* CUDA >= 9 csrsv2-style API needs an explicit work buffer for the analysis/solve */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(),
                               #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 loTriFactor->solveInfo,
                                 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                               #else
                                 loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
                               #endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        /* keep AALo (as AA_h) for fast value refreshes; the index buffers are no longer needed */
        loTriFactor->AA_h = AALo;
        cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the lower triangular matrix (same layout as the build branch above: strictly-lower entries then unit diagonal per row) */
        loTriFactor->AA_h[0]  = 1.0;
        v        = aa;
        vi       = aj;   /* NOTE(review): vi is assigned but unused in this branch */
        offset   = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
          offset      += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset      += 1;
          v  += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
511 
512 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
513 {
514   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
515   PetscInt                          n = A->rmap->n;
516   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
517   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
518   cusparseStatus_t                  stat;
519   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
520   const MatScalar                   *aa = a->a,*v;
521   PetscInt                          *AiUp, *AjUp;
522   PetscInt                          i,nz, nzUpper, offset;
523   PetscErrorCode                    ierr;
524   cudaError_t                       cerr;
525 
526   PetscFunctionBegin;
527   if (!n) PetscFunctionReturn(0);
528   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
529     try {
530       /* next, figure out the number of nonzeros in the upper triangular matrix. */
531       nzUpper = adiag[0]-adiag[n];
532       if (!upTriFactor) {
533         PetscScalar *AAUp;
534 
535         cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
536 
537         /* Allocate Space for the upper triangular matrix */
538         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
539         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
540 
541         /* Fill the upper triangular matrix */
542         AiUp[0]=(PetscInt) 0;
543         AiUp[n]=nzUpper;
544         offset = nzUpper;
545         for (i=n-1; i>=0; i--) {
546           v  = aa + adiag[i+1] + 1;
547           vi = aj + adiag[i+1] + 1;
548 
549           /* number of elements NOT on the diagonal */
550           nz = adiag[i] - adiag[i+1]-1;
551 
552           /* decrement the offset */
553           offset -= (nz+1);
554 
555           /* first, set the diagonal elements */
556           AjUp[offset] = (PetscInt) i;
557           AAUp[offset] = (MatScalar)1./v[nz];
558           AiUp[i]      = AiUp[i+1] - (nz+1);
559 
560           ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
561           ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
562         }
563 
564         /* allocate space for the triangular factor information */
565         ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
566         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
567 
568         /* Create the matrix description */
569         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
570         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
571        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
572         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
573        #else
574         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
575        #endif
576         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
577         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
578 
579         /* set the operation */
580         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
581 
582         /* set the matrix */
583         upTriFactor->csrMat = new CsrMatrix;
584         upTriFactor->csrMat->num_rows = n;
585         upTriFactor->csrMat->num_cols = n;
586         upTriFactor->csrMat->num_entries = nzUpper;
587 
588         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
589         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
590 
591         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
592         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
593 
594         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
595         upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
596 
597         /* Create the solve analysis information */
598         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
599         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
600       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
601         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
602                                      upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
603                                      upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
604                                      upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
605                                      &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
606         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
607       #endif
608 
609         /* perform the solve analysis */
610         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
611                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
612                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
613                                  upTriFactor->csrMat->column_indices->data().get(),
614                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
615                                  upTriFactor->solveInfo,
616                                  upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
617                                #else
618                                  upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
619                                #endif
620         cerr = WaitForCUDA();CHKERRCUDA(cerr);
621         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
622 
623         /* assign the pointer */
624         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
625         upTriFactor->AA_h = AAUp;
626         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
627         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
628         ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
629       } else {
630         if (!upTriFactor->AA_h) {
631           cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
632         }
633         /* Fill the upper triangular matrix */
634         offset = nzUpper;
635         for (i=n-1; i>=0; i--) {
636           v  = aa + adiag[i+1] + 1;
637 
638           /* number of elements NOT on the diagonal */
639           nz = adiag[i] - adiag[i+1]-1;
640 
641           /* decrement the offset */
642           offset -= (nz+1);
643 
644           /* first, set the diagonal elements */
645           upTriFactor->AA_h[offset] = 1./v[nz];
646           ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
647         }
648         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
649         ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
650       }
651     } catch(char *ex) {
652       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
653     }
654   }
655   PetscFunctionReturn(0);
656 }
657 
/* Pushes the ILU triangular factors of A to the GPU and caches the row/column
   permutations (as device index arrays) needed by the permuted MatSolve paths.
   Identity permutations are not uploaded; non-identity ones are uploaded once
   and reused on subsequent calls. */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           rowIS = aij->row,colIS = aij->icol;
  PetscBool                    rowIdentity,colIdentity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!factors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  /* build (or refresh) the lower and upper triangular factors on the GPU */
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* scratch vector used between the lower and upper triangular solves */
  if (!factors->workVector) factors->workVector = new THRUSTARRAY(n);
  factors->nnz = aij->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* row permutation: upload only when nontrivial and not already cached */
  ierr = ISIdentity(rowIS,&rowIdentity);CHKERRQ(ierr);
  if (!rowIdentity && !factors->rpermIndices) {
    const PetscInt *ridx;

    ierr = ISGetIndices(rowIS,&ridx);CHKERRQ(ierr);
    factors->rpermIndices = new THRUSTINTARRAY(n);
    factors->rpermIndices->assign(ridx,ridx+n);
    ierr = ISRestoreIndices(rowIS,&ridx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* column permutation: same caching policy as the row permutation */
  ierr = ISIdentity(colIS,&colIdentity);CHKERRQ(ierr);
  if (!colIdentity && !factors->cpermIndices) {
    const PetscInt *cidx;

    ierr = ISGetIndices(colIS,&cidx);CHKERRQ(ierr);
    factors->cpermIndices = new THRUSTINTARRAY(n);
    factors->cpermIndices->assign(cidx,cidx+n);
    ierr = ISRestoreIndices(colIS,&cidx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
701 
/* Builds the two triangular-factor structures used by the ICC (Cholesky) solve on
   the GPU. The upper factor stores the factored rows with inverted diagonal; the
   "lower" factor reuses the SAME upper-triangular CSR pattern but with diagonally
   scaled values and is solved with CUSPARSE_OPERATION_TRANSPOSE, so no separate
   lower-triangular structure is stored. On the first call (no factors yet) the
   full structure, descriptors, and solve analysis are created; on later calls
   (same sparsity pattern) only the numerical values are recomputed and uploaded. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;   /* pinned host CSR row offsets / column indices (first build only) */
  PetscScalar                       *AAUp;          /* pinned host values of the upper factor */
  PetscScalar                       *AALo;          /* pinned host values of the diagonally scaled copy used for the lower solve */
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data; /* NOTE(review): A->data was cast to Mat_SeqAIJ above; only b->i/j/a are read here — presumably the two structs share that leading layout, confirm */
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0); /* empty matrix: nothing to build */
  /* rebuild only when the authoritative data is on the CPU (or nothing is on the GPU yet) */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (!upTriFactor && !loTriFactor) { /* first numeric factorization: build pattern, values, descriptors, and analysis */
        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers; the diagonal of row i appears to be stored as the LAST entry, v[nz] */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements: both factors store the inverted diagonal */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            /* negate off-diagonals for the upper factor; lower-solve copy is the same row scaled by 1/diag */
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        /* UNIT diag: the stored (inverted) diagonal entries are not used by the triangular solve itself */
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* csrsv2 interface: query and allocate the solve buffer before the analysis call */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(),
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 upTriFactor->solveInfo,
                                 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                                #else
                                  upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
                                #endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        /* the "lower" factor keeps the upper-triangular pattern (FILL_MODE_UPPER) ... */
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* ... and is solved TRANSPOSED, which makes it act as a lower-triangular solve */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(),
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 loTriFactor->solveInfo,
                                 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                                #else
                                 loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
                                #endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        /* both factors uploaded offsets+indices+values, hence the factor of 2 */
        ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      } else { /* pattern already on the GPU: recompute values only and refresh the device arrays */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        /* one factor existing without the other indicates corrupted state */
        if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
921 
/* Pushes the ICC triangular factors of A to the GPU and caches the row
   permutation and its inverse as device index arrays for the permuted solve.

   Fix: guard the permutation upload with !rpermIndices (matching
   MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU); previously a repeated numeric
   factorization re-allocated the THRUSTINTARRAYs on every call and leaked
   the old device arrays. rpermIndices and cpermIndices are always created
   together below, so checking one suffices. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  /* scratch vector used between the two triangular solves */
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  /* nnz of the full symmetric factor: both strict triangles plus the diagonal */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* upload the permutation and its inverse, but only when it is nontrivial and not cached yet */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity && !cusparseTriFactors->rpermIndices) {
    IS             iip;
    const PetscInt *irip,*rip;

    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
959 
/* Numeric Cholesky factorization for SEQAIJCUSPARSE: the factorization itself
   runs on the CPU (via MatCholeskyFactorNumeric_SeqAIJ), after which the
   triangular factors are analyzed and copied to the GPU and the solve
   function pointers are selected according to the row permutation. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *bseq = (Mat_SeqAIJ*)B->data;
  IS             rowperm = bseq->row;
  PetscBool      identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* make sure the host copy of A is current, then factor on the CPU */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* determine which version of MatSolve needs to be used: the natural-ordering
     kernels avoid the permutation step entirely */
  ierr = ISIdentity(rowperm,&identity);CHKERRQ(ierr);
  if (identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  /* no CUSPARSE mat-solve implementations in either case */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* push the triangular factors to the GPU and run the solve analysis */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
989 
/* Builds the transposes (CSC form) of the lower and upper triangular factors on
   the GPU and runs the cuSPARSE solve analysis on them, so that transposed solves
   can be performed as NON_TRANSPOSE solves on the transposed data. Transposing
   flips the fill mode (upper <-> lower) while the diag type and index base carry
   over unchanged.

   Fix: the two MAT_CUSPARSEGenerateTranspose profiling sections each called
   PetscLogEventBegin twice; the second call of each pair is now the matching
   PetscLogEventEnd, restoring correct event nesting. */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor; the fill mode flips under transposition */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation: the transposed data is solved without further transposition */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* CUDA 11 csr2csc requires an explicit work buffer; query its size and allocate it first */
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                       loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                       loTriFactor->csrMat->values->data().get(),
                                       loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(),
                                       loTriFactorT->csrMat->values->data().get(),
                                       loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat);
                        #else
                          loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
                        #endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); /* was erroneously PetscLogEventBegin */

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  /* csrsv2 interface: query and allocate the solve buffer before the analysis call */
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                           loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                           loTriFactorT->csrMat->column_indices->data().get(),
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           loTriFactorT->solveInfo,
                           loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                          #else
                           loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
                          #endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor; the fill mode flips under transposition */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                upTriFactor->csrMat->values->data().get(),
                                upTriFactor->csrMat->row_offsets->data().get(),
                                upTriFactor->csrMat->column_indices->data().get(),
                                upTriFactorT->csrMat->values->data().get(),
                                upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                CUSPARSE_ACTION_NUMERIC,indexBase,
                                CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat);
                        #else
                          upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
                        #endif

  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); /* was erroneously PetscLogEventBegin */

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
  #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
  #endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                           upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                           upTriFactorT->csrMat->column_indices->data().get(),
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           upTriFactorT->solveInfo,
                           upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                          #else
                           upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
                          #endif

  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
1195 
/* Unary functor converting a PetscScalar into a PetscInt by truncating its
   real part. Used with thrust::transform to turn a scalar array that holds
   integer-valued entries (e.g. a permutation written by csr2csc) back into
   an integer index array. */
struct PetscScalarToPetscInt
{
  __host__ __device__
  PetscInt operator()(PetscScalar v)
  {
    /* discard any imaginary part (complex builds) and truncate toward zero */
    return (PetscInt)PetscRealPart(v);
  }
};
1204 
/* Build (or refresh) an explicit transpose of A on the GPU so MatMultTranspose
   can run as an ordinary SpMV on the transposed data.

   CSR path: the transpose is produced by csr2csc. The permutation taking A's
   values to the transpose's values is computed once (by running csr2csc on a
   0,1,2,... sequence) and cached in cusparsestruct->csr2csc_i; subsequent
   numeric-only updates are then a cheap permuted copy of the values array.

   ELL/HYB path (CUDA < 11 only): convert HYB -> CSR -> CSC -> HYB each time.

   No-op when the user disabled explicit transposes, the matrix has zero rows
   or columns, or the cached transpose is already up to date (A->transupdated). */
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (!A->form_explicit_transpose || !A->rmap->n || !A->cmap->n) PetscFunctionReturn(0);
  /* make sure the (non-transposed) device copy of A is current before transposing it */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(0); /* cached transpose is still valid */
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
    /* non-CSR formats are rebuilt from scratch; drop any stale transpose first */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
  }
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
    /* the transpose inherits the index base (0 or 1) of the original matrix */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

    /* set alpha and beta: device-resident scalar constants used by SpMV calls */
    err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* allocate the transpose with swapped dimensions; nnz is unchanged */
      CsrMatrix *matrixT = new CsrMatrix;
      matstructT->mat = matrixT;
      matrixT->num_rows = A->cmap->n;
      matrixT->num_cols = A->rmap->n;
      matrixT->num_entries = a->nz;
      matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values = new THRUSTARRAY(a->nz);

      /* device copy of A's row offsets (a->i), needed as csr2csc input below */
      if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&matstructT->matDescr,
                               matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                               matrixT->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
     #endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
   #else
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows = A->rmap->n;
      temp->num_cols = A->cmap->n;
      temp->num_entries = a->nz;
      temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle,
                              matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get());CHKERRCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      /* NOTE(review): tempT mirrors temp's dimensions (rmap x cmap) and its
         row_offsets array is sized rmap->n+1, while csr2csc below writes a
         column-pointer array of size cmap->n+1 into it. Harmless for square
         matrices; confirm for rectangular ones. */
      tempT->num_rows = A->rmap->n;
      tempT->num_cols = A->cmap->n;
      tempT->num_entries = a->nz;
      tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                              temp->num_cols, temp->num_entries,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get(),
                              tempT->values->data().get(),
                              tempT->column_indices->data().get(),
                              tempT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

      /* Last, convert CSC to HYB */
      cusparseHybMat_t hybMat;
      stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
      cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
        CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                              matstructT->descr, tempT->values->data().get(),
                              tempT->row_offsets->data().get(),
                              tempT->column_indices->data().get(),
                              hybMat, 0, partition);CHKERRCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries: the HYB object owns its own copy of the data */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY*) tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
        delete (CsrMatrix*) tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY*) temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
        delete (CsrMatrix*) temp;
      }
     #endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
    if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix");
    if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows");
    if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols");
    if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values");
    if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT");
    if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows");
    if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols");
    if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
      ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    if (!cusparsestruct->csr2csc_i) {
      /* Run csr2csc on the sequence 0,1,2,... (stored as scalars) instead of the
         real values: the "transposed values" it produces are exactly the
         permutation mapping A's values to the transpose's values, which we cache
         in csr2csc_i so future updates skip csr2csc entirely. */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* CUDA >= 11 csr2cscEx2 needs an explicit, caller-allocated work buffer */
      void   *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                           A->cmap->n, matrix->num_entries,
                                           matrix->values->data().get(),
                                           cusparsestruct->rowoffsets_gpu->data().get(),
                                           matrix->column_indices->data().get(),
                                           matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                           CUSPARSE_ACTION_NUMERIC,indexBase,
                                           cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat);
      err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err);
     #endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11, even though CUDA-10 is fine
           and every parameter checks out. Cause unknown.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should simply be filled with indexBase, so we take that shortcut in the else branch below.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                              A->cmap->n,matrix->num_entries,
                              csr2csc_a.data().get(),
                              cusparsestruct->rowoffsets_gpu->data().get(),
                              matrix->column_indices->data().get(),
                              matrixT->values->data().get(),
                             #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                              matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                              CUSPARSE_ACTION_NUMERIC,indexBase,
                              cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat);
                             #else
                              matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
                             #endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
      }

      /* convert the permuted scalar sequence back into integer indices */
      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      err = cudaFree(csr2cscBuffer);CHKERRCUDA(err);
     #endif
    }
    /* numeric update: gather A's current values through the cached permutation
       straight into the transpose's values array */
    PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
                                                     thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
                                                     matrixT->values->begin()));
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices are not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
1413 
1414 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/* Solve A^T x = b using explicitly transposed triangular factors.
   Since A was factored as LU, A^T = U^T L^T, so the transposed upper factor is
   applied first, then the transposed lower factor. The transposed factors are
   generated lazily on first use. Row/column reorderings from the factorization
   are applied via the cached rperm/cperm index arrays on the GPU.
   bb is the right-hand side (read only), xx receives the solution. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly (first call only) */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder b with the row permutation, writing the result into x */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve with the transposed upper factor: x -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Then, solve with the transposed lower factor: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                         xarray);CHKERRCUSPARSE(stat);
                      #endif

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore the vector device arrays */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves: ~2*nnz flops, minus n (diagonal counted once) */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1502 
/* Solve A^T x = b with transposed triangular factors when the factorization
   used the natural ordering, so no row/column permutations are needed.
   A = LU => A^T = U^T L^T: solve with the transposed upper factor first,
   then the transposed lower factor. Transposed factors are built lazily. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly (first call only) */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve with the transposed upper factor: b -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Then, solve with the transposed lower factor: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        xarray);CHKERRCUSPARSE(stat);
                      #endif

  /* restore the vector device arrays */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves: ~2*nnz flops, minus n (diagonal counted once) */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1572 
/* Solve A x = b on the GPU using the LU triangular factors: permute b with the
   row permutation, forward-solve with L, back-solve with U, then permute the
   result with the column permutation. bb is the right-hand side (read only),
   xx receives the solution; tempGPU is a preallocated work vector. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder b with the row permutation into the work vector */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                         xarray,
                         loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                         xarray);CHKERRCUSPARSE(stat);
                      #endif

  /* Then, solve U: x -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,xarray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Last, reorder with the column permutation into the output vector */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves: ~2*nnz flops, minus n (diagonal counted once) */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1648 
/* Solve A x = b on the GPU when the factorization used the natural ordering:
   forward-solve with L then back-solve with U, with no permutations.
   bb is the right-hand side (read only), xx receives the solution. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L: b -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        loTriFactor->solvePolicy,loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Next, solve U: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        xarray);CHKERRCUSPARSE(stat);
                      #endif

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves: ~2*nnz flops, minus n (diagonal counted once) */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1710 
/*
   Synchronizes the host copy of the matrix values with the device copy.

   Only acts when the device holds the sole up-to-date copy (offload mask is
   PETSC_OFFLOAD_GPU); in that case the CSR values array is copied device->host
   and the mask becomes PETSC_OFFLOAD_BOTH. Only the numerical values travel:
   the sparsity pattern on the host is assumed current.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *aij  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cudaError_t        cerr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    CsrMatrix *csr = (CsrMatrix*)cusp->mat->mat;

    ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    cerr = cudaMemcpy(aij->a, csr->values->data().get(), aij->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr); /* make sure the values are on the host before anyone reads them */
    ierr = PetscLogGpuToCpu(aij->nz*sizeof(PetscScalar));CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    A->offloadmask = PETSC_OFFLOAD_BOTH; /* host and device now agree */
  }
  PetscFunctionReturn(0);
}
1731 
/*
   MatSeqAIJGetArray() implementation for SEQAIJCUSPARSE: hands out the host
   values array after syncing it from the device. The offload mask is set to
   PETSC_OFFLOAD_CPU, so the device copy is treated as stale afterwards (the
   caller may modify the returned array).
*/
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* ensure the host values are current before exposing them */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  *array         = aij->a;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
1743 
/*
   Copies (or rebuilds) the device representation of the matrix from the host
   AIJ data.

   Two paths:
   - If the nonzero pattern is unchanged (nonzerostate matches) and the format
     is CSR, only the values array is re-uploaded and the cached transpose is
     value-invalidated.
   - Otherwise the whole device structure (CSR arrays, descriptors, scalar
     constants, compressed-row index list, work vector) is destroyed and
     rebuilt from scratch, and the cached transpose is fully invalidated.

   Errors if the matrix is bound to the CPU. On success the offload mask is
   PETSC_OFFLOAD_BOTH, unless the host had no values array (a->a == NULL), in
   which case only the pattern was uploaded and the mask is left unchanged.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      matrix->values->assign(a->a, a->a+a->nz); /* host->device upload via thrust */
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* pattern unchanged: transpose structure stays, only its values are stale */
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
    } else {
      PetscInt nnz;
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* nonzero pattern changed (or non-CSR format): tear down everything and rebuild */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        /* compressed-row storage skips empty rows; ridx maps compressed rows to global rows */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
        if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");
        /* no values on the host: upload pattern only and do not mark the host copy valid */
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
        else nnz = a->nz;

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident scalar constants (1 and 0), used because the
           cusparse handle is put in CUSPARSE_POINTER_MODE_DEVICE below */
        err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                    mat->num_rows, mat->num_cols, mat->num_entries,
                                    mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                    mat->values->data().get(),
                                    CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                    CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
          }
         #endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
         #else
          /* build a temporary CSR matrix on the device, convert it to HYB/ELL, then discard it */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
              matstruct->descr, mat->values->data().get(),
              mat->row_offsets->data().get(),
              mat->column_indices->data().get(),
              hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          /* the temporary CSR copy is no longer needed once converted */
          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
         #endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      cusparsestruct->nonzerostate = A->nonzerostate; /* remember the pattern we just uploaded */
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
1900 
/* Thrust functor: accumulate the first tuple element into the second (dst += src) */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t) { thrust::get<1>(t) += thrust::get<0>(t); }
};
1910 
/* Thrust functor: copy the first tuple element into the second (dst = src) */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t) { thrust::get<1>(t) = thrust::get<0>(t); }
};
1920 
/* Thrust functor: copy the second tuple element into the first (reverse of VecCUDAEquals) */
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t) { thrust::get<0>(t) = thrust::get<1>(t); }
};
1930 
/* Scratch data attached to C->product for cuSPARSE matrix-matrix products;
   released by MatDestroy_MatMatCusparse() */
struct MatMatCusparse {
  PetscBool             cisdense;  /* user's C was MATSEQDENSE (CPU); convert result back at the end of the numeric phase */
  PetscScalar           *Bt;       /* device buffer holding B^T for ABt/RARt with CUDA < 11 (cusparseXcsrmm cannot transpose B) */
  Mat                   X;         /* intermediate dense result for RARt (A*R^T) and PtAP (A*P) */
  PetscBool             reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops;    /* flop count logged during the numeric phase */
  CsrMatrix             *Bcsr;    /* uncompressed copy of B when B is stored in compressed row format */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr; /* descriptor for Bcsr (when used) */
  PetscBool             initialized;   /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr;
  cusparseDnMatDescr_t  matCDescr;
  PetscInt              Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
  size_t                mmBufferSize;
  void                  *mmBuffer;
  void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};
1950 
/*
   Destructor for the MatMatCusparse product data: frees the device buffers,
   the cuSPARSE descriptors (CUDA >= 11 only), the intermediate dense matrix X,
   and finally the struct itself. Installed as C->product->destroy.
*/
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  PetscErrorCode   ierr;
  MatMatCusparse   *mmdata = (MatMatCusparse *)data;
  cudaError_t      cerr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t stat;
 #endif

  PetscFunctionBegin;
  cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr); /* cudaFree(NULL) is a no-op, so unconditional free is safe */
  /* NOTE(review): if CsrMatrix has no destructor freeing its thrust arrays,
     this delete leaks them -- confirm against cusparsematimpl.h */
  delete mmdata->Bcsr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->mmBuffer)    { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); }
  if (mmdata->mmBuffer2)   { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); }
  if (mmdata->matBDescr)   { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->matCDescr)   { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->spgemmDesc)  { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); }
 #endif
  ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1975 
1976 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1977 
/*
   Numeric phase of sparse (SEQAIJCUSPARSE) times dense (SEQDENSECUDA)
   products: AB, AtB, ABt, PtAP (as P^T * (A*P)) and RARt (as R * (A*R^T)^T).

   For PtAP/RARt the sparse-dense product goes into the intermediate mmdata->X
   and a dense-dense multiply finishes the job. With CUDA >= 11 the work is
   done by cusparseSpMM (which supports op(B)); descriptors and the SpMM
   buffer are (re)created lazily when leading dimensions change. With older
   CUDA, B is explicitly transposed into mmdata->Bt when needed because
   cusparseXcsrmm cannot transpose B.

   If the user's B (or C) was a CPU dense matrix it is converted in place to
   CUDA for the computation and converted back at the end.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr   = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp   = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* pick op(A) and the output dimensions m x n of the sparse-dense product */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP: /* PtAP computes X = A*P here; P^T*X is done below */
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    /* either let cusparse transpose A on the fly, or use the stored explicit transpose */
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt: /* RARt computes X = A*R^T here; R*X is done below */
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);

  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* PtAP/RARt write the sparse-dense product into the intermediate X, not C */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
    /* grow the buffer only; a larger-than-needed buffer is fine to reuse */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      cudaError_t cerr;
      cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
      cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
 #else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* out-of-place transpose of B into mmdata->Bt via cublas geam */
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k    = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
 #endif
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  /* finish RARt/PtAP with a dense-dense multiply against B (R or P) */
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  /* convert C/B back to their original CPU dense types if we converted them */
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2150 
/*
   Symbolic phase of sparse (SEQAIJCUSPARSE) times dense products: determines
   the output sizes for the requested product type, sets C to SEQDENSECUDA
   (remembering whether the user asked for a CPU SEQDENSE result), allocates
   the MatMatCusparse scratch data (Bt buffer for CUDA < 11 ABt/RARt, the
   intermediate X for RARt/PtAP), and installs the numeric-phase callback.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  /* result dimensions for each supported product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense;
 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
 #endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2224 
/*
   Numeric phase of sparse-sparse (SEQAIJCUSPARSE * SEQAIJCUSPARSE) products.

   The sparsity pattern of C was fixed in the symbolic phase; here only the
   values are (re)computed, with cusparseSpGEMM_compute/_copy on CUDA >= 11 and
   cusparse_csr_spgemm otherwise. AtB/ABt are handled through the explicit
   transposes built during the symbolic phase (symmetric operands degrade to
   AB). If mmdata->reusesym is set, the values were already produced during the
   symbolic phase and only the bookkeeping at 'finalize' runs.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t         BmatSpDescr;
#endif

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;
  A = product->A;
  B = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE; /* only valid once; the next numeric call must recompute */
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix*)Cmat->mat;
    if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
    goto finalize;
  }
  if (!c->nz) goto finalize; /* empty product: nothing to compute */
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);

  /* symmetric operands let us avoid the explicit transpose */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
  if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix*)Cmat->mat;
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  /* recompute values using the buffers/descriptor set up in the symbolic phase */
  stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  C->offloadmask = PETSC_OFFLOAD_GPU; /* result lives on the device only */
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
  ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
  c->reallocs         = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}
2339 
/*
   MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE - symbolic phase of C = A*B, A^T*B or A*B^t
   for two MATSEQAIJCUSPARSE matrices, entirely on the GPU via cuSPARSE SpGEMM.

   Notes:
   - For AtB (resp. ABt) the explicit transpose of A (resp. B) is formed and the product is
     performed as a plain AB with the transposed operand.
   - If A uses compressed row storage, C inherits it; if B does, a temporary CsrMatrix view of B
     with full (uncompressed) row offsets is built (stored in mmdata->Bcsr).
   - With CUDA >= 11 the numeric values are ALSO computed here (cusparseSpGEMM_compute does both);
     with CUDA < 11 the old csrgemm interface cannot do symbolic-only either, so values are
     computed in both branches. The host copy of the (i,j) pattern is then assembled below
     at finalizesym.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *a,*b,*c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     i,j,m,n,k;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble               flops;
  PetscBool                    biscompressed,ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  int64_t                      C_num_rows1, C_num_cols1, C_nnz1; /* cusparseSpMatGetSize() outputs are 64-bit */
  size_t                       bufSize2;
  cusparseSpMatDescr_t         BmatSpDescr;
#else
  int                          cnz;
#endif

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  /* both operands must be MATSEQAIJCUSPARSE in CSR format */
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ*)A->data;
  b = (Mat_SeqAIJ*)B->data;
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
  ptype = product->type;
  /* for symmetric operands, A^T*B == A*B and A*B^T == A*B: fall back to the cheaper AB path */
  if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
  if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  /* select operand mult structs and the (m,n,k) sizes of the product;
     for AtB/ABt the explicit transpose is built so the GPU product is always a plain AB */
  switch (ptype) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    k = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr);
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  ierr  = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  ierr  = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  c     = (Mat_SeqAIJ*)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
  Ccsr  = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
    /* C inherits A's nonzero-row pattern: same nrows and the same rindex map (host and device copies) */
    c->compressedrow.nrows = a->compressedrow.nrows;
    ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
    ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
  Ccusp->mat      = Cmat;
  Ccusp->mat->mat = Ccsr;
  Ccsr->num_rows    = Ccusp->nrows;
  Ccsr->num_cols    = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
  stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
  /* device-resident scalar constants used with CUSPARSE_POINTER_MODE_DEVICE */
  cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
    /* empty product: C has no entries, just build zeroed offsets and skip the GPU SpGEMM entirely */
    thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
    c->nz = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix*)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix */
    /* build a shallow CsrMatrix view of B that shares column indices/values but uses
       full (uncompressed) row offsets, since SpGEMM needs B with all B->rmap->n rows */
    CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
    Bcsr = new CsrMatrix;
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices;
    Bcsr->values         = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
      ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr = Bcsr; /* owned by mmdata; freed in MatDestroy_MatMatCusparse */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
                               Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                               Bcsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  /* precompute flops count (host-side estimate over the CSR patterns; logged by the numeric phase too) */
  if (ptype == MATPRODUCT_AB) {
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i+1];
      for (j=st; j<en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2.*(b->i[brow+1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt anzi = a->i[i+1] - a->i[i];
      const PetscInt bnzi = b->i[i+1] - b->i[i];
      flops += (2.*anzi)*bnzi;
    }
  } else { /* TODO */
    flops = 0.;
  }

  mmdata->flops = flops;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* CUDA >= 11: generic SpGEMM API. Sequence is workEstimation (size query), workEstimation
     (with buffer), compute (size query), compute (with buffer), then SpMatGetSize + copy. */
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
                           NULL, NULL, NULL,
                           CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                           CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
  /* ask bufferSize bytes for external memory */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
  /* ask bufferSize again bytes for external memory */
  stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
  /* The CUSPARSE documentation is not clear, nor the API
     We need both buffers to perform the operations properly!
     mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
     it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
     is stored in the descriptor! What a messy API... */
  cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
  /* compute the intermediate product of A * B */
  stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  /* get matrix C non-zero entries C_nnz1 */
  stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
  c->nz = (PetscInt) C_nnz1;
  ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr);
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#else
  /* CUDA < 11: legacy csrgemm interface; Nnz query runs in host pointer mode */
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
  stat = cusparseXcsrgemmNnz(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
  c->nz = cnz;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
     I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
     D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
  stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalizesym:
  /* mirror the CSR pattern to the host Mat_SeqAIJ (i,j,ilen,imax,...) so CPU-side
     inspection of C works; values array is allocated but not filled here */
  c->singlemalloc = PETSC_FALSE;
  c->free_a       = PETSC_TRUE;
  c->free_ij      = PETSC_TRUE;
  ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
  if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
    PetscInt *d_i = c->i;
    THRUSTINTARRAY ii(Ccsr->row_offsets->size());
    THRUSTINTARRAY jj(Ccsr->column_indices->size());
    ii   = *Ccsr->row_offsets;
    jj   = *Ccsr->column_indices;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  } else {
    PetscInt *d_i = c->i;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  if (ciscompressed) { /* need to expand host row offsets */
    /* rebuild the full m+1 row-offset array c->i from the compressed offsets:
       rows before rindex[k] repeat the previous offset (they are empty) */
    PetscInt r = 0;
    c->i[0] = 0;
    for (k = 0; k < c->compressedrow.nrows; k++) {
      const PetscInt next = c->compressedrow.rindex[k];
      const PetscInt old = c->compressedrow.i[k];
      for (; r < next; r++) c->i[r+1] = old;
    }
    for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
  }
  ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
  c->maxnz = c->nz;
  c->nonzerorowcnt = 0;
  c->rmax = 0;
  /* per-row lengths and row statistics from the expanded offsets */
  for (k = 0; k < m; k++) {
    const PetscInt nn = c->i[k+1] - c->i[k];
    c->ilen[k] = c->imax[k] = nn;
    c->nonzerorowcnt += (PetscInt)!!nn;
    c->rmax = PetscMax(c->rmax,nn);
  }
  ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
  Ccsr->num_entries = c->nz;

  C->nonzerostate++;
  ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
  Ccusp->nonzerostate = C->nonzerostate;
  C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
  C->preallocated  = PETSC_TRUE;
  C->assembled     = PETSC_FALSE;
  C->was_assembled = PETSC_FALSE;
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
2663 
2664 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2665 
2666 /* handles sparse or dense B */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
  PetscErrorCode ierr;
  PetscBool      Bdense = PETSC_FALSE,Bgpu = PETSC_FALSE,Cgpu = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  /* classify the operands: B dense vs cusparse, and (for ABC) whether C is cusparse;
     a CPU-bound operand disqualifies the GPU path */
  ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&Bdense);CHKERRQ(ierr);
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Bgpu);CHKERRQ(ierr);
  }
  if (product->type == MATPRODUCT_ABC) {
    Cgpu = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Cgpu);CHKERRQ(ierr);
    }
  }
  if (Bdense) {
    const MatProductType t = product->type;
    if (t == MATPRODUCT_AB || t == MATPRODUCT_AtB || t == MATPRODUCT_ABt || t == MATPRODUCT_PtAP || t == MATPRODUCT_RARt) {
      if (product->A->boundtocpu) {
        /* A lives on the CPU: defer to the plain SeqAIJ x SeqDense machinery */
        ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
    } else if (t == MATPRODUCT_ABC) {
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
    }
    /* other product types: leave productsymbolic unset */
  } else if (Bgpu && Cgpu) {
    const MatProductType t = product->type;
    if (t == MATPRODUCT_AB || t == MATPRODUCT_AtB || t == MATPRODUCT_ABt) {
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
    } else if (t == MATPRODUCT_PtAP || t == MATPRODUCT_RARt || t == MATPRODUCT_ABC) {
      /* composite products fall back to chained basic implementation */
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
    }
    /* other product types: leave productsymbolic unset */
  } else { /* fallback for AIJ */
    ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2724 
/* yy = A*xx: delegate to the shared kernel with no add vector (NULL), trans=FALSE, herm=FALSE */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2733 
/* zz = A*xx + yy: delegate to the shared kernel with trans=FALSE, herm=FALSE */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2742 
/* yy = A^H*xx: delegate to the shared kernel with no add vector, trans=TRUE, herm=TRUE */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2751 
/* zz = A^H*xx + yy: delegate to the shared kernel with trans=TRUE, herm=TRUE */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2760 
/* yy = A^T*xx: delegate to the shared kernel with no add vector, trans=TRUE, herm=FALSE */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2769 
/* ScatterAdd - device kernel: y[idx[i]] += x[i] for i in [0,n).

   Grid-stride loop with PetscInt indexing: correct for any launch configuration
   (the original flat `int i = blockIdx.x*blockDim.x + threadIdx.x` both tied
   correctness to a sufficiently large grid and could overflow 32-bit int for
   very large n). No atomics are used, so entries of idx are assumed distinct
   within one launch — TODO(review): confirm with the callers. */
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  const PetscInt stride = (PetscInt)gridDim.x*blockDim.x;
  for (PetscInt i = (PetscInt)blockIdx.x*blockDim.x + threadIdx.x; i < n; i += stride) y[idx[i]] += x[i];
}
2775 
2776 /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
2777 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
2778 {
2779   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
2780   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
2781   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
2782   PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
2783   PetscErrorCode               ierr;
2784   cusparseStatus_t             stat;
2785   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2786   PetscBool                    compressed;
2787 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2788   PetscInt                     nx,ny;
2789 #endif
2790 
2791   PetscFunctionBegin;
2792   if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
2793   if (!a->nonzerorowcnt) {
2794     if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
2795     else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
2796     PetscFunctionReturn(0);
2797   }
2798   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
2799   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2800   if (!trans) {
2801     matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2802     if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
2803   } else {
2804     if (herm || !A->form_explicit_transpose) {
2805       opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
2806       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2807     } else {
2808       if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);}
2809       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
2810     }
2811   }
2812   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
2813   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
2814 
2815   try {
2816     ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
2817     if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
2818     else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */
2819 
2820     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2821     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2822       /* z = A x + beta y.
2823          If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
2824          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
2825       */
2826       xptr = xarray;
2827       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
2828       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
2829      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2830       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
2831           allocated to accommodate different uses. So we get the length info directly from mat.
2832        */
2833       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2834         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2835         nx = mat->num_cols;
2836         ny = mat->num_rows;
2837       }
2838      #endif
2839     } else {
2840       /* z = A^T x + beta y
2841          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
2842          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
2843        */
2844       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
2845       dptr = zarray;
2846       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
2847       if (compressed) { /* Scatter x to work vector */
2848         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
2849         thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
2850                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2851                          VecCUDAEqualsReverse());
2852       }
2853      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2854       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2855         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2856         nx = mat->num_rows;
2857         ny = mat->num_cols;
2858       }
2859      #endif
2860     }
2861 
2862     /* csr_spmv does y = alpha op(A) x + beta y */
2863     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2864      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2865       if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
2866       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
2867         cudaError_t cerr;
2868         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2869         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2870         stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
2871                                 matstruct->matDescr,
2872                                 matstruct->cuSpMV[opA].vecXDescr, beta,
2873                                 matstruct->cuSpMV[opA].vecYDescr,
2874                                 cusparse_scalartype,
2875                                 cusparsestruct->spmvAlg,
2876                                 &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
2877         cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);
2878 
2879         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
2880       } else {
2881         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
2882         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
2883         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
2884       }
2885 
2886       stat = cusparseSpMV(cusparsestruct->handle, opA,
2887                                matstruct->alpha_one,
2888                                matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTransposeForMult() */
2889                                matstruct->cuSpMV[opA].vecXDescr,
2890                                beta,
2891                                matstruct->cuSpMV[opA].vecYDescr,
2892                                cusparse_scalartype,
2893                                cusparsestruct->spmvAlg,
2894                                matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
2895      #else
2896       CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2897       stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
2898                                mat->num_rows, mat->num_cols,
2899                                mat->num_entries, matstruct->alpha_one, matstruct->descr,
2900                                mat->values->data().get(), mat->row_offsets->data().get(),
2901                                mat->column_indices->data().get(), xptr, beta,
2902                                dptr);CHKERRCUSPARSE(stat);
2903      #endif
2904     } else {
2905       if (cusparsestruct->nrows) {
2906        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2907         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2908        #else
2909         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
2910         stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
2911                                  matstruct->alpha_one, matstruct->descr, hybMat,
2912                                  xptr, beta,
2913                                  dptr);CHKERRCUSPARSE(stat);
2914        #endif
2915       }
2916     }
2917     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2918 
2919     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2920       if (yy) { /* MatMultAdd: zz = A*xx + yy */
2921         if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
2922           ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
2923         } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
2924           ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
2925         }
2926       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
2927         ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
2928       }
2929 
2930       /* ScatterAdd the result from work vector into the full vector when A is compressed */
2931       if (compressed) {
2932         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2933         /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred)
2934            and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
2935            prevent that. So I just add a ScatterAdd kernel.
2936          */
2937        #if 0
2938         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
2939         thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
2940                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
2941                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2942                          VecCUDAPlusEquals());
2943        #else
2944         PetscInt n = matstruct->cprowIndices->size();
2945         ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
2946        #endif
2947         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2948       }
2949     } else {
2950       if (yy && yy != zz) {
2951         ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
2952       }
2953     }
2954     ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
2955     if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
2956     else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
2957   } catch(char *ex) {
2958     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
2959   }
2960   if (yy) {
2961     ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
2962   } else {
2963     ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
2964   }
2965   PetscFunctionReturn(0);
2966 }
2967 
/* MatMultTransposeAdd: zz = A^T*xx + yy, computed on the GPU.
   Thin dispatcher into the shared product kernel with trans = PETSC_TRUE and
   herm = PETSC_FALSE (plain transpose, not Hermitian transpose). */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2976 
/* Finish assembly of a SEQAIJCUSPARSE matrix.
   If a device-side split-CSR structure exists (assembly done directly on the GPU),
   the matrix is marked as up to date on the GPU after the host-side bookkeeping. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode              ierr;
  PetscSplitCSRDataStructure  *d_mat = NULL;
  PetscFunctionBegin;
  /* only non-factored matrices carry a Mat_SeqAIJCUSPARSE in spptr */
  if (A->factortype == MAT_FACTOR_NONE) {
    d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
  }
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); // this does very little if assembled on GPU - call it?
  /* nothing more to do for a flush, or if the user pinned the matrix to the CPU */
  if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0);
  if (d_mat) {
    /* values were assembled directly into the device matrix; GPU copy is authoritative */
    A->offloadmask = PETSC_OFFLOAD_GPU;
  }

  PetscFunctionReturn(0);
}
2993 
2994 /* --------------------------------------------------------------------------------*/
2995 /*@
2996    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
2998    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
2999    assembly performance the user should preallocate the matrix storage by setting
3000    the parameter nz (or the array nnz).  By setting these parameters accurately,
3001    performance during matrix assembly can be increased by more than a factor of 50.
3002 
3003    Collective
3004 
3005    Input Parameters:
3006 +  comm - MPI communicator, set to PETSC_COMM_SELF
3007 .  m - number of rows
3008 .  n - number of columns
3009 .  nz - number of nonzeros per row (same for all rows)
3010 -  nnz - array containing the number of nonzeros in the various rows
3011          (possibly different for each row) or NULL
3012 
3013    Output Parameter:
3014 .  A - the matrix
3015 
3016    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
3018    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
3019 
3020    Notes:
3021    If nnz is given then nz is ignored
3022 
3023    The AIJ format (also called the Yale sparse matrix format or
3024    compressed row storage), is fully compatible with standard Fortran 77
3025    storage.  That is, the stored row and column indices can begin at
3026    either one (as in Fortran) or zero.  See the users' manual for details.
3027 
3028    Specify the preallocated storage with either nz or nnz (not both).
3029    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
3030    allocation.  For large problems you MUST preallocate memory or you
3031    will get TERRIBLE performance, see the users' manual chapter on matrices.
3032 
3033    By default, this format uses inodes (identical nodes) when possible, to
3034    improve numerical efficiency of matrix-vector products and solves. We
3035    search for consecutive rows with the same nonzero structure, thereby
3036    reusing matrix information to achieve increased efficiency.
3037 
3038    Level: intermediate
3039 
3040 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
3041 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  /* sequential matrix: local sizes equal global sizes */
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* the cast drops const; MatSeqAIJSetPreallocation_SeqAIJ does not modify nnz */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3053 
/* Destroy a SEQAIJCUSPARSE matrix: free the GPU-side structures (plain or
   triangular-factor variant), any device-resident split-CSR helper struct,
   remove composed methods, and finally dispatch to the base SeqAIJ destroy. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode              ierr;
  PetscSplitCSRDataStructure  *d_mat = NULL;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    /* stash the device struct pointer before spptr is torn down below */
    d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
    ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL;
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  if (d_mat) {
    Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;
    cudaError_t                err;
    PetscSplitCSRDataStructure h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* copy the struct back to the host so its interior device pointers can be freed */
    err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    if (a->compressedrow.use) {
      /* diag.i only freed in the compressed-row case; presumably only allocated then -- matches allocation site elsewhere */
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  /* detach all type-specific composed methods before falling through to SeqAIJ */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3089 
3090 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
3091 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicate by building a host SeqAIJ copy, then converting it in place back
   to SEQAIJCUSPARSE so the new matrix gets its own GPU structures. */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3101 
/* Y = a*X + Y on the GPU for SEQAIJCUSPARSE matrices.

   Strategy, depending on the nonzero-pattern relation str:
   - if only one of X,Y is bound to the GPU, fall back to the host SeqAIJ code;
   - if the patterns turn out identical (checked on device when plausible),
     use a single cublas axpy over the value arrays;
   - for SUBSET_NONZERO_PATTERN, use cusparse csrgeam (with the CUDA>=11
     buffered API where required);
   - otherwise fall back to the host implementation. */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry,*csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    /* mixed CPU/GPU binding: invalidate Y's cached transpose and use the host path */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
  if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;
  /* see if we can turn this into a cublas axpy */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    /* same nnz count: compare row offsets and column indices on the device */
    bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
    if (eq) {
      eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
    }
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    cusparseStatus_t stat;
    PetscScalar      b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t           bufferSize;
    void             *buffer;
    cudaError_t      cerr;
#endif

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    /* a and b live on the host for the geam calls */
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* CUDA 11 requires an explicit workspace query + allocation for csrgeam2 */
    stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
                                          &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                          &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                             cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* result is written in place into Y's CSR arrays (third matrix argument) */
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    cerr = cudaFree(buffer);CHKERRCUDA(cerr);
#else
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
#endif
    /* restore PETSc's default pointer mode */
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else if (str == SAME_NONZERO_PATTERN) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t berr;
    PetscBLASInt   one = 1, bnz = 1;

    /* identical patterns: the value arrays line up entry-by-entry, so axpy suffices */
    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr);
    ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else {
    /* different patterns: host fallback (pattern of Y may change) */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3199 
/* Scale every stored nonzero of Y by a on the GPU.
   The CSR value array is treated as one flat vector and scaled with a single
   cuBLAS scal call; the cached diagonal of Y is invalidated afterwards. */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)Y->data;
  PetscScalar    *varray;
  cublasHandle_t handle;
  cublasStatus_t cberr;
  PetscBLASInt   blasOne = 1, blasNz = 1;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEGetArray(Y,&varray);CHKERRQ(ierr);
  ierr = PetscCUBLASGetHandle(&handle);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(aij->nz,&blasNz);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  cberr = cublasXscal(handle,blasNz,&a,varray,blasOne);CHKERRCUBLAS(cberr);
  ierr = PetscLogGpuFlops(blasNz);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSERestoreArray(Y,&varray);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3221 
/* Zero all stored values of A, on both the GPU copies (matrix and cached
   transpose, when present) and the host array. If the GPU values were zeroed
   too, both copies remain valid (PETSC_OFFLOAD_BOTH); otherwise only the CPU
   copy is current. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode             ierr;
  PetscBool                  both = PETSC_FALSE;
  Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
    if (spptr->mat) {
      CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
      if (matrix->values) {
        both = PETSC_TRUE;
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
    if (spptr->matTranspose) {
      /* keep the cached explicit transpose consistent */
      CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
      if (matrix->values) {
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
  }
  //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);
  /* zero the host value array directly; a->i[nrows] is the total nonzero count */
  ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;

  PetscFunctionReturn(0);
}
3253 
/* Bind/unbind the matrix to the CPU.
   flg = PETSC_TRUE : pull values back from the GPU and point the op table and
                      composed methods at the host SeqAIJ implementations;
   flg = PETSC_FALSE: install the CUSPARSE implementations.
   Factored matrices are left untouched. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  if (flg) {
    /* make the host copy current before routing operations to the CPU */
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  /* inodes are a host-side optimization; only meaningful when bound to the CPU */
  a->inode.use = flg;
  PetscFunctionReturn(0);
}
3304 
/* Convert a SeqAIJ matrix to SEQAIJCUSPARSE.
   Creates (or reuses) the GPU-side context in A->spptr — a Mat_SeqAIJCUSPARSE
   for regular matrices or a Mat_SeqAIJCUSPARSETriFactors for factors — sets
   the CUDA default vector type, and installs the type-specific ops. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  /* for MAT_INPLACE_MATRIX, *newmat is A itself */
  B = *newmat;

  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;
      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      spptr->format     = MAT_CUSPARSE_CSR;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      spptr->spmvAlg    = CUSPARSE_CSRMV_ALG1;    /* default, since we only support csr */
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
     #endif
      B->spptr = spptr;
    } else {
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* install GPU implementations of the remaining ops */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3358 
/* Type constructor for MATSEQAIJCUSPARSE: build a SeqAIJ matrix, then convert
   it in place to attach the GPU structures and CUSPARSE operations. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3368 
3369 /*MC
3370    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3371 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
3373    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
3374    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3375 
3376    Options Database Keys:
3377 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
3378 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3379 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3380 
3381   Level: beginner
3382 
3383 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3384 M*/
3385 
3386 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);
3387 
/* Register the CUSPARSE-based solver packages with PETSc's solver registry:
   the banded LU solver for plain SEQAIJ matrices, and the generic cusparse
   solver for LU, Cholesky, ILU and ICC factorizations of SEQAIJCUSPARSE
   matrices. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  const MatFactorType ftypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  PetscInt            i;
  PetscErrorCode      ierr;

  PetscFunctionBegin;
  /* banded LU is registered against the host SEQAIJ type */
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr);
  /* all remaining factorizations share the same factory routine */
  for (i=0; i<(PetscInt)(sizeof(ftypes)/sizeof(ftypes[0])); i++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftypes[i],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3401 
/* Free a Mat_SeqAIJCUSPARSE context: the product structures for A and its
   cached transpose, the thrust work/index vectors, the cusparse handle, and
   finally the struct itself. Safe to call with *cusparsestruct == NULL. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    /* C++ delete is NULL-safe for all of these thrust vectors */
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    delete (*cusparsestruct)->csr2csc_i;
    if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3421 
/* Release a CsrMatrix container: free its thrust value/index/offset vectors,
   delete the container, and NULL the caller's pointer. A NULL *mat is a no-op. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (!*mat) PetscFunctionReturn(0);
  delete (*mat)->values;
  delete (*mat)->column_indices;
  delete (*mat)->row_offsets;
  delete *mat;
  *mat = NULL;
  PetscFunctionReturn(0);
}
3434 
/* Free one triangular-factor structure: its cusparse matrix descriptor and
   analysis info, the CSR storage, and any solve/csr2csc scratch buffers.
   Safe to call with *trifactor == NULL. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    if ((*trifactor)->solveBuffer)   {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
    if ((*trifactor)->AA_h)   {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* csr2cscBuffer only exists with the CUDA 11 generic csr2csc API */
    if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
   #endif
    ierr = PetscFree(*trifactor);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3454 
/* Free a Mat_SeqAIJCUSPARSEMultStruct used for matrix products.
   Releases the stored matrix (CSR, or ELL/HYB before CUDA 11), the cusparse
   matrix descriptor, the compressed-row index vector and the device-resident
   scalar constants; with CUDA >= 11 it also destroys the generic SpMV
   descriptors and workspace buffers created lazily by the product kernels.
   Safe to call with *matstruct == NULL; NULLs the caller's pointer. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
       #endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        /* check the error code, consistent with the trifactor overload above */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    /* one cached SpMV setup per operation (non-transpose/transpose/hermitian) */
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
3498 
/* Reset a triangular-factors context without destroying the cusparse handle:
   free the four factor structures, the permutation index vectors, the work
   vector and the banded-solver device arrays. All freed members are set to
   NULL so a subsequent Reset (e.g. from TriFactors_Destroy after an explicit
   Reset) is a safe no-op rather than a double free. */
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = NULL;
    (*trifactors)->cpermIndices = NULL;
    (*trifactors)->workVector = NULL;
    /* NULL after cudaFree so a repeated Reset cannot free these twice */
    if ((*trifactors)->a_band_d)   {cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);(*trifactors)->a_band_d = NULL;}
    if ((*trifactors)->i_band_d)   {cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);(*trifactors)->i_band_d = NULL;}
    (*trifactors)->init_dev_prop = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3521 
/* Destroy the triangular factor container: release all GPU resources via
   MatSeqAIJCUSPARSETriFactors_Reset(), destroy the cusparse handle owned by the
   container (if any), then free the container itself and set it to NULL. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    /* assignment separated from the test: clearer intent than `if (handle = ...)`
       and avoids compiler warnings about assignment in a condition */
    handle = (*trifactors)->handle;
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3538 
/* Strict weak ordering on (row,col) index pairs: lexicographic, row first.
   Used to sort COO entries into CSR order on the device. */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    /* smaller row wins; within the same row, compare columns */
    return t1.get<0>() < t2.get<0>() || (t1.get<0>() == t2.get<0>() && t1.get<1>() < t2.get<1>());
  }
};
3549 
/* Equality of (row,col) index pairs; used by thrust::unique to collapse
   repeated COO entries. */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    /* both components must match */
    return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>();
  }
};
3559 
/* Binary op for adjacent_difference: 1 where consecutive values differ, 0 where
   they are equal (marks the start of a new run). */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != t2) ? 1 : 0;
  }
};
3568 
/* Binary op combining two 0/1 run markers: logical OR, yielding 0 or 1. */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != 0 || t2 != 0) ? 1 : 0;
  }
};
3577 
3578 #include <thrust/iterator/discard_iterator.h>
/*
  MatSetValuesCOO_SeqAIJCUSPARSE - insert or add values into a matrix that was
  preallocated with MatSetPreallocationCOO_SeqAIJCUSPARSE(), entirely on the GPU.

  cusp->cooPerm (built during preallocation) maps the user's COO ordering to the
  sorted CSR ordering; cusp->cooPerm_a, when non-NULL, is used as the key array
  of a reduce_by_key so that runs of duplicate (i,j) entries are summed into a
  single CSR slot.

  v may be a device or host pointer (detected via isCudaMem()); a NULL v with
  INSERT_VALUES zeroes the stored values.
*/
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY                           *cooPerm_v = NULL; /* device staging copy of v when v lives on the host */
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                             *matrix;
  PetscErrorCode                        ierr;
  PetscInt                              n;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    /* no COO preallocation information: fall back to a plain final assembly */
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) {
    /* NULL values: only INSERT_VALUES has an effect (zero the matrix) */
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else {
    /* host values: stage them on the device before permuting */
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v,v+n);
    d_v = cooPerm_v->data();
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
  }
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) { /* duplicates present: reduce them into a scratch array, then add to current values */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* all entries unique: add permuted input values directly into the CSR values */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals());
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else {
      /* all entries unique: overwrite the CSR values with the permuted input */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    }
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalize:
  delete cooPerm_v; /* no-op when v was already device memory */
  A->offloadmask = PETSC_OFFLOAD_GPU; /* the GPU copy is now the authoritative one */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr);
  ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr);
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
3655 
/* Mark the cached explicit transpose of A as stale; when destroy is true, also
   free the transpose matrix structure and the cached csr2csc permutation. */
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (!cusparsestruct) PetscFunctionReturn(0); /* nothing cached yet */
  if (destroy) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
    delete cusparsestruct->csr2csc_i;
    cusparsestruct->csr2csc_i = NULL;
  }
  A->transupdated = PETSC_FALSE; /* force regeneration on next use */
  PetscFunctionReturn(0);
}
3672 
3673 #include <thrust/binary_search.h>
/*
  MatSetPreallocationCOO_SeqAIJCUSPARSE - build the CSR structure of A from n
  COO (i,j) index pairs, sorting and uniquifying them on the GPU.

  Data recorded for later MatSetValuesCOO_SeqAIJCUSPARSE() calls:
  - cusp->cooPerm:   permutation from the user's COO order to sorted CSR order
  - cusp->cooPerm_a: reduce_by_key keys mapping each sorted COO entry to its CSR
                     slot when repeated (i,j) pairs exist (deleted when all
                     entries are unique)
  The host-side CSR arrays (a->i, a->j) are also (re)built, then copied to the
  GPU via MatSeqAIJCUSPARSECopyToGPU().
*/
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0;
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    /* COO size changed: discard previously cached permutation data */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);        /* device copies of the COO row/column indices */
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n); /* per-row counts via upper_bound (becomes a->i[1..m]) */

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* start with the identity permutation; sort_by_key reorders it alongside
       the (i,j) keys so cooPerm ends up mapping COO order to sorted order */
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare());
    *cusp->cooPerm_a = d_i; /* keep sorted rows (and cols, in w) before unique() compacts d_i/d_j */
    THRUSTINTARRAY w = d_j;

    auto nekey = thrust::unique(fkey, ekey, IJEqual());
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* I couldn't come up with a more elegant algorithm */
      /* mark where the sorted (i,j) sequence changes, then inclusive-scan the
         0/1 markers into the destination CSR slot index of each COO entry */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff());
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());
      (*cusp->cooPerm_a)[0] = 0;
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum());
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>());
    }
    /* count entries per row: upper_bound of each row index in the unique,
       sorted row array yields the CSR row offsets (exclusive of the leading 0) */
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(),
                        search_begin, search_begin + A->rmap->n,
                        ii.begin());
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* rebuild the host CSR arrays from the device results */
    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0;
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    /* per-row lengths, nonzero-row count, and max row length */
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
  } else {
    /* empty COO list: fall back to standard empty preallocation */
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);

  A->assembled = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3772 
/* Return a read-only device pointer to the CSR values of A. The matrix is
   first pushed to the GPU if the CPU copy is newer; the offload mask and the
   cached transpose are left untouched since the values cannot be modified.
   Pair with MatSeqAIJCUSPARSERestoreArrayRead(). Only the CSR storage format
   is supported. */
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csrmat;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); /* ensure device data is current */
  if (!cusparsestruct->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)cusparsestruct->mat->mat;
  if (!csrmat->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csrmat->values->data().get();
  PetscFunctionReturn(0);
}
3791 
/* Companion of MatSeqAIJCUSPARSEGetArrayRead(): invalidates the caller's
   pointer. Access was read-only, so the object state is not increased. */
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  *a = NULL; /* prevent accidental use after restore */
  PetscFunctionReturn(0);
}
3801 
/* Return a writable device pointer to the CSR values of A. The matrix is
   first pushed to the GPU if the CPU copy is newer; since the caller may
   modify the values, the offload mask is set to GPU and the cached transpose
   is invalidated (values only, structure kept). Pair with
   MatSeqAIJCUSPARSERestoreArray(). Only the CSR storage format is supported. */
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csrmat;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); /* caller may read current values before writing */
  if (!cusparsestruct->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)cusparsestruct->mat->mat;
  if (!csrmat->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csrmat->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* GPU copy becomes authoritative */
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3822 
/* Companion of MatSeqAIJCUSPARSEGetArray(): the device values may have been
   modified, so the object state is increased before the caller's pointer is
   invalidated. */
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); /* values may have changed */
  *a = NULL; /* prevent accidental use after restore */
  PetscFunctionReturn(0);
}
3835 
/* Return a write-only device pointer to the CSR values of A. Unlike
   MatSeqAIJCUSPARSEGetArray(), the current values are NOT copied to the GPU
   first — the caller promises to overwrite them. The offload mask is set to
   GPU and the cached transpose invalidated (values only). Pair with
   MatSeqAIJCUSPARSERestoreArrayWrite(). Only the CSR storage format is
   supported. */
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csrmat;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (!cusparsestruct->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)cusparsestruct->mat->mat;
  if (!csrmat->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csrmat->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* GPU copy becomes authoritative */
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3855 
/* Companion of MatSeqAIJCUSPARSEGetArrayWrite(): the device values have been
   overwritten, so the object state is increased before the caller's pointer is
   invalidated. */
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); /* values were rewritten */
  *a = NULL; /* prevent accidental use after restore */
  PetscFunctionReturn(0);
}
3868 
/* Strict weak ordering for (row, col, value, flag) 4-tuples: lexicographic on
   the (row, col) components only; value and flag are carried along unchanged.
   Used when merging two CSR matrices in COO form. */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    /* smaller row wins; within the same row, compare columns */
    return t1.get<0>() < t2.get<0>() || (t1.get<0>() == t2.get<0>() && t1.get<1>() < t2.get<1>());
  }
};
3879 
/* Unary functor adding a fixed offset to an int; used to shift column indices
   (and row offsets) when placing one matrix after another. */
struct Shift
{
  int _shift; /* the constant offset applied by operator() */

  Shift(int shift) : _shift(shift) {}
  __host__ __device__
  inline int operator() (const int &c)
  {
    return _shift + c;
  }
};
3891 
/* merges two SeqAIJCUSPARSE matrices side by side, [A';B']' operation in matlab notation */
3893 PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
3894 {
3895   PetscErrorCode               ierr;
3896   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
3897   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
3898   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
3899   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
3900   PetscInt                     Annz,Bnnz;
3901   cusparseStatus_t             stat;
3902   PetscInt                     i,m,n,zero = 0;
3903   cudaError_t                  cerr;
3904 
3905   PetscFunctionBegin;
3906   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
3907   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
3908   PetscValidPointer(C,4);
3909   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
3910   PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
3911   if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n);
3912   if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
3913   if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3914   if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3915   if (reuse == MAT_INITIAL_MATRIX) {
3916     m     = A->rmap->n;
3917     n     = A->cmap->n + B->cmap->n;
3918     ierr  = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
3919     ierr  = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
3920     ierr  = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
3921     c     = (Mat_SeqAIJ*)(*C)->data;
3922     Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
3923     Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
3924     Ccsr  = new CsrMatrix;
3925     Cmat->cprowIndices      = NULL;
3926     c->compressedrow.use    = PETSC_FALSE;
3927     c->compressedrow.nrows  = 0;
3928     c->compressedrow.i      = NULL;
3929     c->compressedrow.rindex = NULL;
3930     Ccusp->workVector       = NULL;
3931     Ccusp->nrows    = m;
3932     Ccusp->mat      = Cmat;
3933     Ccusp->mat->mat = Ccsr;
3934     Ccsr->num_rows  = m;
3935     Ccsr->num_cols  = n;
3936     stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
3937     stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
3938     stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
3939     cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
3940     cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
3941     cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
3942     cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
3943     cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
3944     cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
3945     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
3946     ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
3947     ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
3948     ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr);
3949     if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3950     if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3951 
3952     Acsr = (CsrMatrix*)Acusp->mat->mat;
3953     Bcsr = (CsrMatrix*)Bcusp->mat->mat;
3954     Annz = (PetscInt)Acsr->column_indices->size();
3955     Bnnz = (PetscInt)Bcsr->column_indices->size();
3956     c->nz = Annz + Bnnz;
3957     Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
3958     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3959     Ccsr->values = new THRUSTARRAY(c->nz);
3960     Ccsr->num_entries = c->nz;
3961     Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
3962     if (c->nz) {
3963       auto Acoo = new THRUSTINTARRAY32(Annz);
3964       auto Bcoo = new THRUSTINTARRAY32(Bnnz);
3965       auto Ccoo = new THRUSTINTARRAY32(c->nz);
3966       THRUSTINTARRAY32 *Aroff,*Broff;
3967 
3968       if (a->compressedrow.use) { /* need full row offset */
3969         if (!Acusp->rowoffsets_gpu) {
3970           Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
3971           Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
3972           ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
3973         }
3974         Aroff = Acusp->rowoffsets_gpu;
3975       } else Aroff = Acsr->row_offsets;
3976       if (b->compressedrow.use) { /* need full row offset */
3977         if (!Bcusp->rowoffsets_gpu) {
3978           Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
3979           Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
3980           ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
3981         }
3982         Broff = Bcusp->rowoffsets_gpu;
3983       } else Broff = Bcsr->row_offsets;
3984       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3985       stat = cusparseXcsr2coo(Acusp->handle,
3986                               Aroff->data().get(),
3987                               Annz,
3988                               m,
3989                               Acoo->data().get(),
3990                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
3991       stat = cusparseXcsr2coo(Bcusp->handle,
3992                               Broff->data().get(),
3993                               Bnnz,
3994                               m,
3995                               Bcoo->data().get(),
3996                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
3997       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
3998       auto Aperm = thrust::make_constant_iterator(1);
3999       auto Bperm = thrust::make_constant_iterator(0);
4000 #if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
4001       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
4002       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
4003 #else
4004       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
4005       auto Bcib = Bcsr->column_indices->begin();
4006       auto Bcie = Bcsr->column_indices->end();
4007       thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
4008 #endif
4009       auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
4010       auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
4011       auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
4012       auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
4013       auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
4014       auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
4015       auto p1 = Ccusp->cooPerm->begin();
4016       auto p2 = Ccusp->cooPerm->begin();
4017       thrust::advance(p2,Annz);
4018       PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
4019 #if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
4020       thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
4021 #endif
4022       auto cci = thrust::make_counting_iterator(zero);
4023       auto cce = thrust::make_counting_iterator(c->nz);
4024 #if 0 //Errors on SUMMIT cuda 11.1.0
4025       PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
4026 #else
4027       auto pred = thrust::identity<int>();
4028       PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
4029       PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
4030 #endif
4031       stat = cusparseXcoo2csr(Ccusp->handle,
4032                               Ccoo->data().get(),
4033                               c->nz,
4034                               m,
4035                               Ccsr->row_offsets->data().get(),
4036                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4037       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4038       delete wPerm;
4039       delete Acoo;
4040       delete Bcoo;
4041       delete Ccoo;
4042 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4043       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
4044                                Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
4045                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4046                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4047 #endif
4048       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
4049         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4050         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4051         CsrMatrix *CcsrT = new CsrMatrix;
4052         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4053         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4054 
4055         (*C)->form_explicit_transpose = PETSC_TRUE;
4056         (*C)->transupdated = PETSC_TRUE;
4057         Ccusp->rowoffsets_gpu = NULL;
4058         CmatT->cprowIndices = NULL;
4059         CmatT->mat = CcsrT;
4060         CcsrT->num_rows = n;
4061         CcsrT->num_cols = m;
4062         CcsrT->num_entries = c->nz;
4063 
4064         CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
4065         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4066         CcsrT->values = new THRUSTARRAY(c->nz);
4067 
4068         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4069         auto rT = CcsrT->row_offsets->begin();
4070         if (AT) {
4071           rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
4072           thrust::advance(rT,-1);
4073         }
4074         if (BT) {
4075           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
4076           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
4077           thrust::copy(titb,tite,rT);
4078         }
4079         auto cT = CcsrT->column_indices->begin();
4080         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
4081         if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
4082         auto vT = CcsrT->values->begin();
4083         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4084         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4085         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4086 
4087         stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
4088         stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4089         stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
4090         cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
4091         cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
4092         cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
4093         cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4094         cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4095         cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4096 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4097         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
4098                                  CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
4099                                  CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4100                                  CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4101 #endif
4102         Ccusp->matTranspose = CmatT;
4103       }
4104     }
4105 
4106     c->singlemalloc = PETSC_FALSE;
4107     c->free_a       = PETSC_TRUE;
4108     c->free_ij      = PETSC_TRUE;
4109     ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
4110     ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
4111     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
4112       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4113       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4114       ii   = *Ccsr->row_offsets;
4115       jj   = *Ccsr->column_indices;
4116       cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4117       cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4118     } else {
4119       cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4120       cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4121     }
4122     ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
4123     ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
4124     ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
4125     c->maxnz = c->nz;
4126     c->nonzerorowcnt = 0;
4127     c->rmax = 0;
4128     for (i = 0; i < m; i++) {
4129       const PetscInt nn = c->i[i+1] - c->i[i];
4130       c->ilen[i] = c->imax[i] = nn;
4131       c->nonzerorowcnt += (PetscInt)!!nn;
4132       c->rmax = PetscMax(c->rmax,nn);
4133     }
4134     ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
4135     ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
4136     (*C)->nonzerostate++;
4137     ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
4138     ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
4139     Ccusp->nonzerostate = (*C)->nonzerostate;
4140     (*C)->preallocated  = PETSC_TRUE;
4141   } else {
4142     if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",(*C)->rmap->n,B->rmap->n);
4143     c = (Mat_SeqAIJ*)(*C)->data;
4144     if (c->nz) {
4145       Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
4146       if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
4147       if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4148       if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
4149       ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
4150       ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
4151       if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4152       if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4153       Acsr = (CsrMatrix*)Acusp->mat->mat;
4154       Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4155       Ccsr = (CsrMatrix*)Ccusp->mat->mat;
4156       if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
4157       if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
4158       if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
4159       if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
4160       if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
4161       auto pmid = Ccusp->cooPerm->begin();
4162       thrust::advance(pmid,Acsr->num_entries);
4163       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4164       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
4165                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
4166       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
4167                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4168       thrust::for_each(zibait,zieait,VecCUDAEquals());
4169       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
4170                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4171       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
4172                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
4173       thrust::for_each(zibbit,ziebit,VecCUDAEquals());
4174       ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr);
4175       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
4176         if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4177         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4178         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4179         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4180         CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
4181         auto vT = CcsrT->values->begin();
4182         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4183         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4184         (*C)->transupdated = PETSC_TRUE;
4185       }
4186       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4187     }
4188   }
4189   ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
4190   (*C)->assembled     = PETSC_TRUE;
4191   (*C)->was_assembled = PETSC_FALSE;
4192   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
4193   PetscFunctionReturn(0);
4194 }
4195 
/*
  Copies n values of the matrix value array of A into v.

  If idx is provided, v[k] = a[idx[k]] for k = 0..n-1 (a gather performed on the
  device); otherwise the leading n entries of the value array are copied
  contiguously.  v may point to either host or device memory; the memory space
  is detected with isCudaMem() and the copy direction chosen accordingly.
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  PetscErrorCode    ierr;
  bool              dmem;
  const PetscScalar *av;
  cudaError_t       cerr;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
  if (n && idx) {
    /* upload the index set, then gather av[idx[k]] -> dv[k] on the device */
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

    THRUSTARRAY *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w = new THRUSTARRAY(n); /* device scratch buffer; copied back to host v below */
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (w) {
      cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    delete w;
  } else {
    /* no index set: straight copy of the leading n values (n may be 0) */
    cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  /* when v is host memory the data moved device -> host, so log GpuToCpu (was
     incorrectly logged as CpuToGpu, inconsistent with the D2H logging above) */
  if (!dmem) { ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr); }
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4235